").append(m.parseHTML(a)).find(d):a)}).complete(c&&function(a,b){g.each(c,e||[a.responseText,b,a])}),this},m.expr.filters.animated=function(a){return m.grep(m.timers,function(b){return a===b.elem}).length};var cd=a.document.documentElement;function dd(a){return m.isWindow(a)?a:9===a.nodeType?a.defaultView||a.parentWindow:!1}m.offset={setOffset:function(a,b,c){var d,e,f,g,h,i,j,k=m.css(a,"position"),l=m(a),n={};"static"===k&&(a.style.position="relative"),h=l.offset(),f=m.css(a,"top"),i=m.css(a,"left"),j=("absolute"===k||"fixed"===k)&&m.inArray("auto",[f,i])>-1,j?(d=l.position(),g=d.top,e=d.left):(g=parseFloat(f)||0,e=parseFloat(i)||0),m.isFunction(b)&&(b=b.call(a,c,h)),null!=b.top&&(n.top=b.top-h.top+g),null!=b.left&&(n.left=b.left-h.left+e),"using"in b?b.using.call(a,n):l.css(n)}},m.fn.extend({offset:function(a){if(arguments.length)return void 0===a?this:this.each(function(b){m.offset.setOffset(this,a,b)});var b,c,d={top:0,left:0},e=this[0],f=e&&e.ownerDocument;if(f)return b=f.documentElement,m.contains(b,e)?(typeof e.getBoundingClientRect!==K&&(d=e.getBoundingClientRect()),c=dd(f),{top:d.top+(c.pageYOffset||b.scrollTop)-(b.clientTop||0),left:d.left+(c.pageXOffset||b.scrollLeft)-(b.clientLeft||0)}):d},position:function(){if(this[0]){var a,b,c={top:0,left:0},d=this[0];return"fixed"===m.css(d,"position")?b=d.getBoundingClientRect():(a=this.offsetParent(),b=this.offset(),m.nodeName(a[0],"html")||(c=a.offset()),c.top+=m.css(a[0],"borderTopWidth",!0),c.left+=m.css(a[0],"borderLeftWidth",!0)),{top:b.top-c.top-m.css(d,"marginTop",!0),left:b.left-c.left-m.css(d,"marginLeft",!0)}}},offsetParent:function(){return this.map(function(){var a=this.offsetParent||cd;while(a&&!m.nodeName(a,"html")&&"static"===m.css(a,"position"))a=a.offsetParent;return a||cd})}}),m.each({scrollLeft:"pageXOffset",scrollTop:"pageYOffset"},function(a,b){var c=/Y/.test(b);m.fn[a]=function(d){return V(this,function(a,d,e){var f=dd(a);return void 0===e?f?b in f?f[b]:f.document.documentElement[d]:a[d]:void(f?f.scrollTo(c?m(f).scrollLeft():e,c?e:m(f).scrollTop()):a[d]=e)},a,d,arguments.length,null)}}),m.each(["top","left"],function(a,b){m.cssHooks[b]=Lb(k.pixelPosition,function(a,c){return c?(c=Jb(a,b),Hb.test(c)?m(a).position()[b]+"px":c):void 0})}),m.each({Height:"height",Width:"width"},function(a,b){m.each({padding:"inner"+a,content:b,"":"outer"+a},function(c,d){m.fn[d]=function(d,e){var f=arguments.length&&(c||"boolean"!=typeof d),g=c||(d===!0||e===!0?"margin":"border");return V(this,function(b,c,d){var e;return m.isWindow(b)?b.document.documentElement["client"+a]:9===b.nodeType?(e=b.documentElement,Math.max(b.body["scroll"+a],e["scroll"+a],b.body["offset"+a],e["offset"+a],e["client"+a])):void 0===d?m.css(b,c,g):m.style(b,c,d,g)},b,f?d:void 0,f,null)}})}),m.fn.size=function(){return this.length},m.fn.andSelf=m.fn.addBack,"function"==typeof define&&define.amd&&define("jquery",[],function(){return m});var ed=a.jQuery,fd=a.$;return m.noConflict=function(b){return a.$===m&&(a.$=fd),b&&a.jQuery===m&&(a.jQuery=ed),m},typeof b===K&&(a.jQuery=a.$=m),m});
diff --git a/core/src/main/resources/org/apache/spark/ui/static/webui.css b/core/src/main/resources/org/apache/spark/ui/static/webui.css
index 11fd956bfbe66..445110d63e184 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/webui.css
+++ b/core/src/main/resources/org/apache/spark/ui/static/webui.css
@@ -81,7 +81,9 @@ table.sortable thead {
span.kill-link {
margin-right: 2px;
+ margin-left: 20px;
color: gray;
+ float: right;
}
span.kill-link a {
@@ -112,3 +114,8 @@ pre {
padding-bottom: 0;
border: none;
}
+
+.tooltip {
+ font-weight: normal;
+}
+
diff --git a/core/src/main/scala/org/apache/spark/Aggregator.scala b/core/src/main/scala/org/apache/spark/Aggregator.scala
index 59fdf659c9e11..79c9c451d273d 100644
--- a/core/src/main/scala/org/apache/spark/Aggregator.scala
+++ b/core/src/main/scala/org/apache/spark/Aggregator.scala
@@ -55,22 +55,24 @@ case class Aggregator[K, V, C] (
combiners.iterator
} else {
val combiners = new ExternalAppendOnlyMap[K, V, C](createCombiner, mergeValue, mergeCombiners)
- while (iter.hasNext) {
- val (k, v) = iter.next()
- combiners.insert(k, v)
+ combiners.insertAll(iter)
+ // Update task metrics if context is not null
+ // TODO: Make context non optional in a future release
+ Option(context).foreach { c =>
+ c.taskMetrics.memoryBytesSpilled += combiners.memoryBytesSpilled
+ c.taskMetrics.diskBytesSpilled += combiners.diskBytesSpilled
}
- // TODO: Make this non optional in a future release
- Option(context).foreach(c => c.taskMetrics.memoryBytesSpilled = combiners.memoryBytesSpilled)
- Option(context).foreach(c => c.taskMetrics.diskBytesSpilled = combiners.diskBytesSpilled)
combiners.iterator
}
}
@deprecated("use combineCombinersByKey with TaskContext argument", "0.9.0")
- def combineCombinersByKey(iter: Iterator[(K, C)]) : Iterator[(K, C)] =
+ def combineCombinersByKey(iter: Iterator[_ <: Product2[K, C]]) : Iterator[(K, C)] =
combineCombinersByKey(iter, null)
- def combineCombinersByKey(iter: Iterator[(K, C)], context: TaskContext) : Iterator[(K, C)] = {
+ def combineCombinersByKey(iter: Iterator[_ <: Product2[K, C]], context: TaskContext)
+ : Iterator[(K, C)] =
+ {
if (!externalSorting) {
val combiners = new AppendOnlyMap[K,C]
var kc: Product2[K, C] = null
@@ -85,12 +87,15 @@ case class Aggregator[K, V, C] (
} else {
val combiners = new ExternalAppendOnlyMap[K, C, C](identity, mergeCombiners, mergeCombiners)
while (iter.hasNext) {
- val (k, c) = iter.next()
- combiners.insert(k, c)
+ val pair = iter.next()
+ combiners.insert(pair._1, pair._2)
+ }
+ // Update task metrics if context is not null
+ // TODO: Make context non-optional in a future release
+ Option(context).foreach { c =>
+ c.taskMetrics.memoryBytesSpilled += combiners.memoryBytesSpilled
+ c.taskMetrics.diskBytesSpilled += combiners.diskBytesSpilled
}
- // TODO: Make this non optional in a future release
- Option(context).foreach(c => c.taskMetrics.memoryBytesSpilled = combiners.memoryBytesSpilled)
- Option(context).foreach(c => c.taskMetrics.diskBytesSpilled = combiners.diskBytesSpilled)
combiners.iterator
}
}
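The key semantic change in this hunk is that the spill counters are now accumulated with `+=` inside a single `Option(context).foreach` block instead of being overwritten with `=` in two separate ones. A minimal standalone sketch of that pattern; `TaskMetricsStub` and `recordSpill` are stand-ins of mine, not Spark classes:

```scala
// Standalone sketch of the accumulate-don't-overwrite pattern used above.
// TaskMetricsStub is a stand-in for TaskMetrics, not Spark's class.
object SpillMetricsSketch {
  class TaskMetricsStub { var memoryBytesSpilled = 0L; var diskBytesSpilled = 0L }

  def recordSpill(context: TaskMetricsStub, memSpilled: Long, diskSpilled: Long): Unit = {
    // Same shape as the diff: tolerate a null context and add to the running totals.
    Option(context).foreach { c =>
      c.memoryBytesSpilled += memSpilled
      c.diskBytesSpilled += diskSpilled
    }
  }

  def main(args: Array[String]): Unit = {
    val metrics = new TaskMetricsStub
    recordSpill(metrics, memSpilled = 100, diskSpilled = 10) // e.g. map-side combine
    recordSpill(metrics, memSpilled = 200, diskSpilled = 20) // e.g. reduce-side merge
    recordSpill(null, 50, 5)                                 // a null context is ignored
    println(metrics.memoryBytesSpilled) // 300: with the old `=` this would read 200
  }
}
```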
diff --git a/core/src/main/scala/org/apache/spark/CacheManager.scala b/core/src/main/scala/org/apache/spark/CacheManager.scala
index 8f867686a0443..5ddda4d6953fa 100644
--- a/core/src/main/scala/org/apache/spark/CacheManager.scala
+++ b/core/src/main/scala/org/apache/spark/CacheManager.scala
@@ -17,9 +17,9 @@
package org.apache.spark
-import scala.collection.mutable.{ArrayBuffer, HashSet}
+import scala.collection.mutable
+import scala.collection.mutable.ArrayBuffer
-import org.apache.spark.executor.InputMetrics
import org.apache.spark.rdd.RDD
import org.apache.spark.storage._
@@ -30,7 +30,7 @@ import org.apache.spark.storage._
private[spark] class CacheManager(blockManager: BlockManager) extends Logging {
/** Keys of RDD partitions that are being computed/loaded. */
- private val loading = new HashSet[RDDBlockId]()
+ private val loading = new mutable.HashSet[RDDBlockId]
/** Gets or computes an RDD partition. Used by RDD.iterator() when an RDD is cached. */
def getOrCompute[T](
@@ -118,21 +118,29 @@ private[spark] class CacheManager(blockManager: BlockManager) extends Logging {
}
/**
- * Cache the values of a partition, keeping track of any updates in the storage statuses
- * of other blocks along the way.
+ * Cache the values of a partition, keeping track of any updates in the storage statuses of
+ * other blocks along the way.
+ *
+ * The effective storage level refers to the level that actually specifies BlockManager put
+ * behavior, not the level originally specified by the user. This is mainly for forcing a
+ * MEMORY_AND_DISK partition to disk if there is not enough room to unroll the partition,
+ * while preserving the original semantics of the RDD as specified by the application.
*/
private def putInBlockManager[T](
key: BlockId,
values: Iterator[T],
- storageLevel: StorageLevel,
- updatedBlocks: ArrayBuffer[(BlockId, BlockStatus)]): Iterator[T] = {
-
- if (!storageLevel.useMemory) {
- /* This RDD is not to be cached in memory, so we can just pass the computed values
- * as an iterator directly to the BlockManager, rather than first fully unrolling
- * it in memory. The latter option potentially uses much more memory and risks OOM
- * exceptions that can be avoided. */
- updatedBlocks ++= blockManager.put(key, values, storageLevel, tellMaster = true)
+ level: StorageLevel,
+ updatedBlocks: ArrayBuffer[(BlockId, BlockStatus)],
+ effectiveStorageLevel: Option[StorageLevel] = None): Iterator[T] = {
+
+ val putLevel = effectiveStorageLevel.getOrElse(level)
+ if (!putLevel.useMemory) {
+ /*
+ * This RDD is not to be cached in memory, so we can just pass the computed values as an
+ * iterator directly to the BlockManager rather than first fully unrolling it in memory.
+ */
+ updatedBlocks ++=
+ blockManager.putIterator(key, values, level, tellMaster = true, effectiveStorageLevel)
blockManager.get(key) match {
case Some(v) => v.data.asInstanceOf[Iterator[T]]
case None =>
@@ -140,14 +148,36 @@ private[spark] class CacheManager(blockManager: BlockManager) extends Logging {
throw new BlockException(key, s"Block manager failed to return cached value for $key!")
}
} else {
- /* This RDD is to be cached in memory. In this case we cannot pass the computed values
+ /*
+ * This RDD is to be cached in memory. In this case we cannot pass the computed values
* to the BlockManager as an iterator and expect to read it back later. This is because
- * we may end up dropping a partition from memory store before getting it back, e.g.
- * when the entirety of the RDD does not fit in memory. */
- val elements = new ArrayBuffer[Any]
- elements ++= values
- updatedBlocks ++= blockManager.put(key, elements, storageLevel, tellMaster = true)
- elements.iterator.asInstanceOf[Iterator[T]]
+ * we may end up dropping a partition from memory store before getting it back.
+ *
+ * In addition, we must be careful to not unroll the entire partition in memory at once.
+ * Otherwise, we may cause an OOM exception if the JVM does not have enough space for this
+ * single partition. Instead, we unroll the values cautiously, potentially aborting and
+ * dropping the partition to disk if applicable.
+ */
+ blockManager.memoryStore.unrollSafely(key, values, updatedBlocks) match {
+ case Left(arr) =>
+ // We have successfully unrolled the entire partition, so cache it in memory
+ updatedBlocks ++=
+ blockManager.putArray(key, arr, level, tellMaster = true, effectiveStorageLevel)
+ arr.iterator.asInstanceOf[Iterator[T]]
+ case Right(it) =>
+ // There is not enough space to cache this partition in memory
+ logWarning(s"Not enough space to cache partition $key in memory! " +
+ s"Free memory is ${blockManager.memoryStore.freeMemory} bytes.")
+ val returnValues = it.asInstanceOf[Iterator[T]]
+ if (putLevel.useDisk) {
+ logWarning(s"Persisting partition $key to disk instead.")
+ val diskOnlyLevel = StorageLevel(useDisk = true, useMemory = false,
+ useOffHeap = false, deserialized = false, putLevel.replication)
+ putInBlockManager[T](key, returnValues, level, updatedBlocks, Some(diskOnlyLevel))
+ } else {
+ returnValues
+ }
+ }
}
}
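The recursion at the bottom of putInBlockManager is the interesting part: the user-visible storage level is kept, but the put is retried once with a disk-only "effective" level when unrolling fails. A self-contained sketch of that shape; `Level`, `unrollSafely` and `put` here are stand-ins of mine, not the BlockManager/MemoryStore API:

```scala
// Standalone sketch of the effective-storage-level fallback shape.
object EffectiveLevelSketch {
  sealed trait Level { def useMemory: Boolean; def useDisk: Boolean }
  case object MemoryAndDisk extends Level { val useMemory = true;  val useDisk = true }
  case object DiskOnly      extends Level { val useMemory = false; val useDisk = true }

  // Stand-in for MemoryStore.unrollSafely: Left(seq) if everything fits in the
  // budget, Right(iterator) with all remaining values otherwise.
  def unrollSafely[T](values: Iterator[T], budget: Int): Either[Seq[T], Iterator[T]] = {
    val buf = scala.collection.mutable.ArrayBuffer.empty[T]
    while (values.hasNext && buf.size < budget) buf += values.next()
    if (values.hasNext) Right(buf.iterator ++ values) else Left(buf.toSeq)
  }

  def put[T](values: Iterator[T], level: Level, effective: Option[Level] = None): Iterator[T] = {
    val putLevel = effective.getOrElse(level)
    if (!putLevel.useMemory) {
      println(s"writing straight through at $putLevel")
      values
    } else {
      unrollSafely(values, budget = 4) match {
        case Left(seq) =>
          println("partition fit in memory, caching it there")
          seq.iterator
        case Right(it) if putLevel.useDisk =>
          // Same recursion as the diff: keep the user's level, retry the put
          // with a disk-only effective level.
          put(it, level, Some(DiskOnly))
        case Right(it) =>
          it
      }
    }
  }

  def main(args: Array[String]): Unit =
    put((1 to 10).iterator, MemoryAndDisk).foreach(println)
}
```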
diff --git a/core/src/main/scala/org/apache/spark/Dependency.scala b/core/src/main/scala/org/apache/spark/Dependency.scala
index 09a60571238ea..ab2594cfc02eb 100644
--- a/core/src/main/scala/org/apache/spark/Dependency.scala
+++ b/core/src/main/scala/org/apache/spark/Dependency.scala
@@ -27,29 +27,35 @@ import org.apache.spark.shuffle.ShuffleHandle
* Base class for dependencies.
*/
@DeveloperApi
-abstract class Dependency[T](val rdd: RDD[T]) extends Serializable
+abstract class Dependency[T] extends Serializable {
+ def rdd: RDD[T]
+}
/**
* :: DeveloperApi ::
- * Base class for dependencies where each partition of the parent RDD is used by at most one
- * partition of the child RDD. Narrow dependencies allow for pipelined execution.
+ * Base class for dependencies where each partition of the child RDD depends on a small number
+ * of partitions of the parent RDD. Narrow dependencies allow for pipelined execution.
*/
@DeveloperApi
-abstract class NarrowDependency[T](rdd: RDD[T]) extends Dependency(rdd) {
+abstract class NarrowDependency[T](_rdd: RDD[T]) extends Dependency[T] {
/**
* Get the parent partitions for a child partition.
* @param partitionId a partition of the child RDD
* @return the partitions of the parent RDD that the child partition depends upon
*/
def getParents(partitionId: Int): Seq[Int]
+
+ override def rdd: RDD[T] = _rdd
}
/**
* :: DeveloperApi ::
- * Represents a dependency on the output of a shuffle stage.
- * @param rdd the parent RDD
+ * Represents a dependency on the output of a shuffle stage. Note that in the case of shuffle,
+ * the RDD is transient since we don't need it on the executor side.
+ *
+ * @param _rdd the parent RDD
* @param partitioner partitioner used to partition the shuffle output
* @param serializer [[org.apache.spark.serializer.Serializer Serializer]] to use. If set to None,
* the default serializer, as specified by `spark.serializer` config option, will
@@ -57,20 +63,22 @@ abstract class NarrowDependency[T](rdd: RDD[T]) extends Dependency(rdd) {
*/
@DeveloperApi
class ShuffleDependency[K, V, C](
- @transient rdd: RDD[_ <: Product2[K, V]],
+ @transient _rdd: RDD[_ <: Product2[K, V]],
val partitioner: Partitioner,
val serializer: Option[Serializer] = None,
val keyOrdering: Option[Ordering[K]] = None,
val aggregator: Option[Aggregator[K, V, C]] = None,
val mapSideCombine: Boolean = false)
- extends Dependency(rdd.asInstanceOf[RDD[Product2[K, V]]]) {
+ extends Dependency[Product2[K, V]] {
+
+ override def rdd = _rdd.asInstanceOf[RDD[Product2[K, V]]]
- val shuffleId: Int = rdd.context.newShuffleId()
+ val shuffleId: Int = _rdd.context.newShuffleId()
- val shuffleHandle: ShuffleHandle = rdd.context.env.shuffleManager.registerShuffle(
- shuffleId, rdd.partitions.size, this)
+ val shuffleHandle: ShuffleHandle = _rdd.context.env.shuffleManager.registerShuffle(
+ shuffleId, _rdd.partitions.size, this)
- rdd.sparkContext.cleaner.foreach(_.registerShuffleForCleanup(this))
+ _rdd.sparkContext.cleaner.foreach(_.registerShuffleForCleanup(this))
}
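With `rdd` now an abstract `def` on Dependency, each subclass decides how the parent reference is stored (ShuffleDependency keeps it `@transient`). A minimal sketch of a custom narrow dependency under the new contract, mirroring what OneToOneDependency does; it is illustrative only, not new API:

```scala
// A one-to-one style dependency under the new Dependency contract, where the
// base class exposes `def rdd` instead of storing a constructor val.
import org.apache.spark.NarrowDependency
import org.apache.spark.rdd.RDD

class IdentityDependency[T](parent: RDD[T]) extends NarrowDependency[T](parent) {
  // Each child partition depends on exactly the parent partition with the same index.
  override def getParents(partitionId: Int): Seq[Int] = List(partitionId)
}
```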
diff --git a/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala b/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala
new file mode 100644
index 0000000000000..24ccce21b62ca
--- /dev/null
+++ b/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark
+
+import akka.actor.Actor
+import org.apache.spark.executor.TaskMetrics
+import org.apache.spark.storage.BlockManagerId
+import org.apache.spark.scheduler.TaskScheduler
+
+/**
+ * A heartbeat from executors to the driver. This is a shared message used by several internal
+ * components to convey liveness or execution information for in-progress tasks.
+ */
+private[spark] case class Heartbeat(
+ executorId: String,
+ taskMetrics: Array[(Long, TaskMetrics)], // taskId -> TaskMetrics
+ blockManagerId: BlockManagerId)
+
+private[spark] case class HeartbeatResponse(reregisterBlockManager: Boolean)
+
+/**
+ * Lives in the driver to receive heartbeats from executors.
+ */
+private[spark] class HeartbeatReceiver(scheduler: TaskScheduler) extends Actor {
+ override def receive = {
+ case Heartbeat(executorId, taskMetrics, blockManagerId) =>
+ val response = HeartbeatResponse(
+ !scheduler.executorHeartbeatReceived(executorId, taskMetrics, blockManagerId))
+ sender ! response
+ }
+}
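For context, a sketch of how the executor side of this protocol might look with Akka's ask pattern. Only the `Heartbeat`/`HeartbeatResponse` messages come from the file above; the actor ref, timeout and surrounding wiring are assumptions, and the object sits in `org.apache.spark` because the messages are `private[spark]`:

```scala
// Sketch only: the real executor wiring differs; this just shows the message exchange.
package org.apache.spark

import scala.concurrent.Await
import scala.concurrent.duration._
import akka.actor.ActorRef
import akka.pattern.ask
import akka.util.Timeout
import org.apache.spark.executor.TaskMetrics
import org.apache.spark.storage.BlockManagerId

object HeartbeatSenderSketch {
  def reportHeartbeat(
      receiverRef: ActorRef,                 // assumed reference to "HeartbeatReceiver"
      executorId: String,
      taskMetrics: Array[(Long, TaskMetrics)],
      blockManagerId: BlockManagerId): Unit = {
    implicit val timeout = Timeout(30.seconds)  // assumed timeout
    val future = (receiverRef ? Heartbeat(executorId, taskMetrics, blockManagerId))
      .mapTo[HeartbeatResponse]
    val response = Await.result(future, timeout.duration)
    if (response.reregisterBlockManager) {
      // The driver no longer knows this executor's block manager; a real
      // executor would re-register it here.
      println("driver asked for block manager re-registration")
    }
  }
}
```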
diff --git a/core/src/main/scala/org/apache/spark/Logging.scala b/core/src/main/scala/org/apache/spark/Logging.scala
index 50d8e93e1f0d7..d4f2624061e35 100644
--- a/core/src/main/scala/org/apache/spark/Logging.scala
+++ b/core/src/main/scala/org/apache/spark/Logging.scala
@@ -39,16 +39,17 @@ trait Logging {
// be serialized and used on another machine
@transient private var log_ : Logger = null
+ // Method to get the logger name for this object
+ protected def logName = {
+ // Ignore trailing $'s in the class names for Scala objects
+ this.getClass.getName.stripSuffix("$")
+ }
+
// Method to get or create the logger for this object
protected def log: Logger = {
if (log_ == null) {
initializeIfNecessary()
- var className = this.getClass.getName
- // Ignore trailing $'s in the class names for Scala objects
- if (className.endsWith("$")) {
- className = className.substring(0, className.length - 1)
- }
- log_ = LoggerFactory.getLogger(className)
+ log_ = LoggerFactory.getLogger(logName)
}
log_
}
@@ -110,23 +111,27 @@ trait Logging {
}
private def initializeLogging() {
- // If Log4j is being used, but is not initialized, load a default properties file
- val binder = StaticLoggerBinder.getSingleton
- val usingLog4j = binder.getLoggerFactoryClassStr.endsWith("Log4jLoggerFactory")
- val log4jInitialized = LogManager.getRootLogger.getAllAppenders.hasMoreElements
- if (!log4jInitialized && usingLog4j) {
+ // Don't use a logger in here, as this is itself occurring during initialization of a logger
+ // If Log4j 1.2 is being used, but is not initialized, load a default properties file
+ val binderClass = StaticLoggerBinder.getSingleton.getLoggerFactoryClassStr
+ // This distinguishes the log4j 1.2 binding, currently
+ // org.slf4j.impl.Log4jLoggerFactory, from the log4j 2.0 binding, currently
+ // org.apache.logging.slf4j.Log4jLoggerFactory
+ val usingLog4j12 = "org.slf4j.impl.Log4jLoggerFactory".equals(binderClass)
+ val log4j12Initialized = LogManager.getRootLogger.getAllAppenders.hasMoreElements
+ if (!log4j12Initialized && usingLog4j12) {
val defaultLogProps = "org/apache/spark/log4j-defaults.properties"
Option(Utils.getSparkClassLoader.getResource(defaultLogProps)) match {
case Some(url) =>
PropertyConfigurator.configure(url)
- log.info(s"Using Spark's default log4j profile: $defaultLogProps")
+ System.err.println(s"Using Spark's default log4j profile: $defaultLogProps")
case None =>
System.err.println(s"Spark was unable to load $defaultLogProps")
}
}
Logging.initialized = true
- // Force a call into slf4j to initialize it. Avoids this happening from mutliple threads
+ // Force a call into slf4j to initialize it. Avoids this happening from multiple threads
// and triggering this: http://mailman.qos.ch/pipermail/slf4j-dev/2010-April/002956.html
log
}
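The new `logName` hook means a class mixing in Logging can choose its logger name instead of always deriving it from `getClass`. A small illustrative sketch; the class and logger names are made up:

```scala
// Sketch of overriding the new logName hook introduced above.
import org.apache.spark.Logging

class StreamingIngestJob extends Logging {
  // Group all log output from ingest classes under one logger name instead of
  // the (possibly anonymous) runtime class name.
  override protected def logName = "org.example.ingest"

  def run(): Unit = {
    logInfo("starting ingest")  // logged under "org.example.ingest"
  }
}
```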
diff --git a/core/src/main/scala/org/apache/spark/Partitioner.scala b/core/src/main/scala/org/apache/spark/Partitioner.scala
index ec99648a8488a..37053bb6f37ad 100644
--- a/core/src/main/scala/org/apache/spark/Partitioner.scala
+++ b/core/src/main/scala/org/apache/spark/Partitioner.scala
@@ -19,11 +19,15 @@ package org.apache.spark
import java.io.{IOException, ObjectInputStream, ObjectOutputStream}
-import scala.reflect.ClassTag
+import scala.collection.mutable
+import scala.collection.mutable.ArrayBuffer
+import scala.reflect.{ClassTag, classTag}
+import scala.util.hashing.byteswap32
-import org.apache.spark.rdd.RDD
+import org.apache.spark.rdd.{PartitionPruningRDD, RDD}
import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.util.{CollectionsUtils, Utils}
+import org.apache.spark.util.random.{XORShiftRandom, SamplingUtils}
/**
* An object that defines how the elements in a key-value pair RDD are partitioned by key.
@@ -103,26 +107,49 @@ class RangePartitioner[K : Ordering : ClassTag, V](
private var ascending: Boolean = true)
extends Partitioner {
+ // We allow partitions = 0, which happens when sorting an empty RDD under the default settings.
+ require(partitions >= 0, s"Number of partitions cannot be negative but found $partitions.")
+
private var ordering = implicitly[Ordering[K]]
// An array of upper bounds for the first (partitions - 1) partitions
private var rangeBounds: Array[K] = {
- if (partitions == 1) {
- Array()
+ if (partitions <= 1) {
+ Array.empty
} else {
- val rddSize = rdd.count()
- val maxSampleSize = partitions * 20.0
- val frac = math.min(maxSampleSize / math.max(rddSize, 1), 1.0)
- val rddSample = rdd.sample(false, frac, 1).map(_._1).collect().sorted
- if (rddSample.length == 0) {
- Array()
+ // This is the sample size we need to have roughly balanced output partitions, capped at 1M.
+ val sampleSize = math.min(20.0 * partitions, 1e6)
+ // Assume the input partitions are roughly balanced and over-sample a little bit.
+ val sampleSizePerPartition = math.ceil(3.0 * sampleSize / rdd.partitions.size).toInt
+ val (numItems, sketched) = RangePartitioner.sketch(rdd.map(_._1), sampleSizePerPartition)
+ if (numItems == 0L) {
+ Array.empty
} else {
- val bounds = new Array[K](partitions - 1)
- for (i <- 0 until partitions - 1) {
- val index = (rddSample.length - 1) * (i + 1) / partitions
- bounds(i) = rddSample(index)
+ // If a partition contains much more than the average number of items, we re-sample from it
+ // to ensure that enough items are collected from that partition.
+ val fraction = math.min(sampleSize / math.max(numItems, 1L), 1.0)
+ val candidates = ArrayBuffer.empty[(K, Float)]
+ val imbalancedPartitions = mutable.Set.empty[Int]
+ sketched.foreach { case (idx, n, sample) =>
+ if (fraction * n > sampleSizePerPartition) {
+ imbalancedPartitions += idx
+ } else {
+ // The weight is 1 over the sampling probability.
+ val weight = (n.toDouble / sample.size).toFloat
+ for (key <- sample) {
+ candidates += ((key, weight))
+ }
+ }
+ }
+ if (imbalancedPartitions.nonEmpty) {
+ // Re-sample imbalanced partitions with the desired sampling probability.
+ val imbalanced = new PartitionPruningRDD(rdd.map(_._1), imbalancedPartitions.contains)
+ val seed = byteswap32(-rdd.id - 1)
+ val reSampled = imbalanced.sample(withReplacement = false, fraction, seed).collect()
+ val weight = (1.0 / fraction).toFloat
+ candidates ++= reSampled.map(x => (x, weight))
}
- bounds
+ RangePartitioner.determineBounds(candidates, partitions)
}
}
}
@@ -134,8 +161,8 @@ class RangePartitioner[K : Ordering : ClassTag, V](
def getPartition(key: Any): Int = {
val k = key.asInstanceOf[K]
var partition = 0
- if (rangeBounds.length < 1000) {
- // If we have less than 100 partitions naive search
+ if (rangeBounds.length <= 128) {
+ // With at most 128 range bounds, a naive linear search is fast enough
while (partition < rangeBounds.length && ordering.gt(k, rangeBounds(partition))) {
partition += 1
}
@@ -212,3 +239,67 @@ class RangePartitioner[K : Ordering : ClassTag, V](
}
}
}
+
+private[spark] object RangePartitioner {
+
+ /**
+ * Sketches the input RDD via reservoir sampling on each partition.
+ *
+ * @param rdd the input RDD to sketch
+ * @param sampleSizePerPartition max sample size per partition
+ * @return (total number of items, an array of (partitionId, number of items, sample))
+ */
+ def sketch[K:ClassTag](
+ rdd: RDD[K],
+ sampleSizePerPartition: Int): (Long, Array[(Int, Int, Array[K])]) = {
+ val shift = rdd.id
+ // val classTagK = classTag[K] // to avoid serializing the entire partitioner object
+ val sketched = rdd.mapPartitionsWithIndex { (idx, iter) =>
+ val seed = byteswap32(idx ^ (shift << 16))
+ val (sample, n) = SamplingUtils.reservoirSampleAndCount(
+ iter, sampleSizePerPartition, seed)
+ Iterator((idx, n, sample))
+ }.collect()
+ val numItems = sketched.map(_._2.toLong).sum
+ (numItems, sketched)
+ }
+
+ /**
+ * Determines the bounds for range partitioning from candidates with weights indicating how many
+ * items each represents. Usually this is 1 over the probability used to sample this candidate.
+ *
+ * @param candidates unordered candidates with weights
+ * @param partitions number of partitions
+ * @return selected bounds
+ */
+ def determineBounds[K:Ordering:ClassTag](
+ candidates: ArrayBuffer[(K, Float)],
+ partitions: Int): Array[K] = {
+ val ordering = implicitly[Ordering[K]]
+ val ordered = candidates.sortBy(_._1)
+ val numCandidates = ordered.size
+ val sumWeights = ordered.map(_._2.toDouble).sum
+ val step = sumWeights / partitions
+ var cumWeight = 0.0
+ var target = step
+ val bounds = ArrayBuffer.empty[K]
+ var i = 0
+ var j = 0
+ var previousBound = Option.empty[K]
+ while ((i < numCandidates) && (j < partitions - 1)) {
+ val (key, weight) = ordered(i)
+ cumWeight += weight
+ if (cumWeight > target) {
+ // Skip duplicate values.
+ if (previousBound.isEmpty || ordering.gt(key, previousBound.get)) {
+ bounds += key
+ target += step
+ j += 1
+ previousBound = Some(key)
+ }
+ }
+ i += 1
+ }
+ bounds.toArray
+ }
+}
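A worked example of `determineBounds`: each candidate carries a weight equal to one over its sampling probability, the target step is `sumWeights / partitions`, and a bound is emitted each time the cumulative weight crosses the next multiple of the step. The sketch lives in `org.apache.spark` because the companion object is `private[spark]`; the data is made up:

```scala
// Worked example of RangePartitioner.determineBounds with hand-picked weights.
package org.apache.spark

import scala.collection.mutable.ArrayBuffer

object DetermineBoundsSketch {
  def main(args: Array[String]): Unit = {
    // Keys 1..9, each candidate standing in for 2 original items (weight = 2.0).
    val candidates = ArrayBuffer.tabulate(9)(i => (i + 1, 2.0f))
    // sumWeights = 18, partitions = 3, so step = 6: bounds are emitted where the
    // cumulative weight first exceeds 6 and then 12, i.e. at keys 4 and 7.
    val bounds = RangePartitioner.determineBounds(candidates, partitions = 3)
    println(bounds.mkString(", "))  // 4, 7
  }
}
```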
diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala
index 8ce4b91cae8ae..38700847c80f4 100644
--- a/core/src/main/scala/org/apache/spark/SparkConf.scala
+++ b/core/src/main/scala/org/apache/spark/SparkConf.scala
@@ -40,6 +40,8 @@ import scala.collection.mutable.HashMap
*/
class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging {
+ import SparkConf._
+
/** Create a SparkConf that loads defaults from system properties and the classpath */
def this() = this(true)
@@ -198,7 +200,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging {
*
* E.g. spark.akka.option.x.y.x = "value"
*/
- getAll.filter {case (k, v) => k.startsWith("akka.")}
+ getAll.filter { case (k, _) => isAkkaConf(k) }
/** Does the configuration contain a given parameter? */
def contains(key: String): Boolean = settings.contains(key)
@@ -292,3 +294,21 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging {
settings.toArray.sorted.map{case (k, v) => k + "=" + v}.mkString("\n")
}
}
+
+private[spark] object SparkConf {
+ /**
+ * Return whether the given config is an akka config (e.g. akka.actor.provider).
+ * Note that this does not include spark-specific akka configs (e.g. spark.akka.timeout).
+ */
+ def isAkkaConf(name: String): Boolean = name.startsWith("akka.")
+
+ /**
+ * Return whether the given config should be passed to an executor on start-up.
+ *
+ * Certain akka and authentication configs are required of the executor when it connects to
+ * the scheduler, while the rest of the spark configs can be inherited from the driver later.
+ */
+ def isExecutorStartupConf(name: String): Boolean = {
+ isAkkaConf(name) || name.startsWith("spark.akka") || name.startsWith("spark.auth")
+ }
+}
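A small sketch of the two new helpers. The example keys are mine, and the object sits in `org.apache.spark` since the companion is `private[spark]`:

```scala
// Sketch of how isAkkaConf / isExecutorStartupConf classify config keys.
package org.apache.spark

object ConfClassificationSketch {
  def main(args: Array[String]): Unit = {
    val keys = Seq(
      "akka.actor.provider",    // raw akka setting
      "spark.akka.frameSize",   // spark's own akka setting
      "spark.authenticate",     // authentication setting
      "spark.executor.memory")  // ordinary spark setting
    keys.foreach { k =>
      println(s"$k: akka=${SparkConf.isAkkaConf(k)}, " +
        s"executorStartup=${SparkConf.isExecutorStartupConf(k)}")
    }
  }
}
```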
diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index 09736be47fa65..e132955f0f850 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -36,6 +36,7 @@ import org.apache.hadoop.mapred.{FileInputFormat, InputFormat, JobConf, Sequence
import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob}
import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat => NewFileInputFormat}
import org.apache.mesos.MesosNativeLibrary
+import akka.actor.Props
import org.apache.spark.annotation.{DeveloperApi, Experimental}
import org.apache.spark.broadcast.Broadcast
@@ -47,7 +48,7 @@ import org.apache.spark.scheduler._
import org.apache.spark.scheduler.cluster.{CoarseGrainedSchedulerBackend, SparkDeploySchedulerBackend, SimrSchedulerBackend}
import org.apache.spark.scheduler.cluster.mesos.{CoarseMesosSchedulerBackend, MesosSchedulerBackend}
import org.apache.spark.scheduler.local.LocalBackend
-import org.apache.spark.storage.{BlockManagerSource, RDDInfo, StorageStatus, StorageUtils}
+import org.apache.spark.storage._
import org.apache.spark.ui.SparkUI
import org.apache.spark.util.{CallSite, ClosureCleaner, MetadataCleaner, MetadataCleanerType, TimeStampedWeakValueHashMap, Utils}
@@ -289,7 +290,7 @@ class SparkContext(config: SparkConf) extends Logging {
value <- Option(System.getenv(envKey)).orElse(Option(System.getProperty(propKey)))} {
executorEnvs(envKey) = value
}
- Option(System.getenv("SPARK_PREPEND_CLASSES")).foreach { v =>
+ Option(System.getenv("SPARK_PREPEND_CLASSES")).foreach { v =>
executorEnvs("SPARK_PREPEND_CLASSES") = v
}
// The Mesos scheduler backend relies on this environment variable to set executor memory.
@@ -307,6 +308,8 @@ class SparkContext(config: SparkConf) extends Logging {
// Create and start the scheduler
private[spark] var taskScheduler = SparkContext.createTaskScheduler(this, master)
+ private val heartbeatReceiver = env.actorSystem.actorOf(
+ Props(new HeartbeatReceiver(taskScheduler)), "HeartbeatReceiver")
@volatile private[spark] var dagScheduler: DAGScheduler = _
try {
dagScheduler = new DAGScheduler(this)
@@ -455,7 +458,7 @@ class SparkContext(config: SparkConf) extends Logging {
/** Distribute a local Scala collection to form an RDD, with one or more
* location preferences (hostnames of Spark nodes) for each object.
* Create a new partition for each collection item. */
- def makeRDD[T: ClassTag](seq: Seq[(T, Seq[String])]): RDD[T] = {
+ def makeRDD[T: ClassTag](seq: Seq[(T, Seq[String])]): RDD[T] = {
val indexToPrefs = seq.zipWithIndex.map(t => (t._2, t._1._2)).toMap
new ParallelCollectionRDD[T](this, seq.map(_._1), seq.size, indexToPrefs)
}
@@ -859,7 +862,9 @@ class SparkContext(config: SparkConf) extends Logging {
*/
@DeveloperApi
def getRDDStorageInfo: Array[RDDInfo] = {
- StorageUtils.rddInfoFromStorageStatus(getExecutorStorageStatus, this)
+ val rddInfos = persistentRdds.values.map(RDDInfo.fromRdd).toArray
+ StorageUtils.updateRddInfo(rddInfos, getExecutorStorageStatus)
+ rddInfos.filter(_.isCached)
}
/**
@@ -1009,15 +1014,15 @@ class SparkContext(config: SparkConf) extends Logging {
val dagSchedulerCopy = dagScheduler
dagScheduler = null
if (dagSchedulerCopy != null) {
+ env.metricsSystem.report()
metadataCleaner.cancel()
+ env.actorSystem.stop(heartbeatReceiver)
cleaner.foreach(_.stop())
dagSchedulerCopy.stop()
taskScheduler = null
// TODO: Cache.stop()?
env.stop()
SparkEnv.set(null)
- ShuffleMapTask.clearCache()
- ResultTask.clearCache()
listenerBus.stop()
eventLogger.foreach(_.stop())
logInfo("Successfully stopped SparkContext")
@@ -1056,7 +1061,7 @@ class SparkContext(config: SparkConf) extends Logging {
*/
private[spark] def getCallSite(): CallSite = {
Option(getLocalProperty("externalCallSite")) match {
- case Some(callSite) => CallSite(callSite, long = "")
+ case Some(callSite) => CallSite(callSite, longForm = "")
case None => Utils.getCallSite
}
}
@@ -1078,11 +1083,12 @@ class SparkContext(config: SparkConf) extends Logging {
}
val callSite = getCallSite
val cleanedFunc = clean(func)
- logInfo("Starting job: " + callSite.short)
+ logInfo("Starting job: " + callSite.shortForm)
val start = System.nanoTime
dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, allowLocal,
resultHandler, localProperties.get)
- logInfo("Job finished: " + callSite.short + ", took " + (System.nanoTime - start) / 1e9 + " s")
+ logInfo(
+ "Job finished: " + callSite.shortForm + ", took " + (System.nanoTime - start) / 1e9 + " s")
rdd.doCheckpoint()
}
@@ -1163,11 +1169,12 @@ class SparkContext(config: SparkConf) extends Logging {
evaluator: ApproximateEvaluator[U, R],
timeout: Long): PartialResult[R] = {
val callSite = getCallSite
- logInfo("Starting job: " + callSite.short)
+ logInfo("Starting job: " + callSite.shortForm)
val start = System.nanoTime
val result = dagScheduler.runApproximateJob(rdd, func, evaluator, callSite, timeout,
localProperties.get)
- logInfo("Job finished: " + callSite.short + ", took " + (System.nanoTime - start) / 1e9 + " s")
+ logInfo(
+ "Job finished: " + callSite.shortForm + ", took " + (System.nanoTime - start) / 1e9 + " s")
result
}
@@ -1222,10 +1229,10 @@ class SparkContext(config: SparkConf) extends Logging {
/**
* Clean a closure to make it ready to serialized and send to tasks
* (removes unreferenced variables in $outer's, updates REPL variables)
- * If <tt>checkSerializable</tt> is set, <tt>clean</tt> will also proactively
- * check to see if <tt>f</tt> is serializable and throw a <tt>SparkException</tt>
+ * If <tt>checkSerializable</tt> is set, <tt>clean</tt> will also proactively
+ * check to see if <tt>f</tt> is serializable and throw a <tt>SparkException</tt>
* if not.
- *
+ *
* @param f the closure to clean
* @param checkSerializable whether or not to immediately check <tt>f</tt> for serializability
* @throws <tt>SparkException</tt> if checkSerializable is set but <tt>f</tt> is not
@@ -1470,9 +1477,9 @@ object SparkContext extends Logging {
/** Creates a task scheduler based on a given master URL. Extracted for testing. */
private def createTaskScheduler(sc: SparkContext, master: String): TaskScheduler = {
// Regular expression used for local[N] and local[*] master formats
- val LOCAL_N_REGEX = """local\[([0-9\*]+)\]""".r
+ val LOCAL_N_REGEX = """local\[([0-9]+|\*)\]""".r
// Regular expression for local[N, maxRetries], used in tests with failing tasks
- val LOCAL_N_FAILURES_REGEX = """local\[([0-9]+)\s*,\s*([0-9]+)\]""".r
+ val LOCAL_N_FAILURES_REGEX = """local\[([0-9]+|\*)\s*,\s*([0-9]+)\]""".r
// Regular expression for simulating a Spark cluster of [N, cores, memory] locally
val LOCAL_CLUSTER_REGEX = """local-cluster\[\s*([0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*]""".r
// Regular expression for connecting to Spark deploy clusters
@@ -1502,8 +1509,12 @@ object SparkContext extends Logging {
scheduler
case LOCAL_N_FAILURES_REGEX(threads, maxFailures) =>
+ def localCpuCount = Runtime.getRuntime.availableProcessors()
+ // local[*, M] means the number of cores on the computer with M failures
+ // local[N, M] means exactly N threads with M failures
+ val threadCount = if (threads == "*") localCpuCount else threads.toInt
val scheduler = new TaskSchedulerImpl(sc, maxFailures.toInt, isLocal = true)
- val backend = new LocalBackend(scheduler, threads.toInt)
+ val backend = new LocalBackend(scheduler, threadCount)
scheduler.initialize(backend)
scheduler
@@ -1550,7 +1561,16 @@ object SparkContext extends Logging {
throw new SparkException("YARN mode not available ?", e)
}
}
- val backend = new CoarseGrainedSchedulerBackend(scheduler, sc.env.actorSystem)
+ val backend = try {
+ val clazz =
+ Class.forName("org.apache.spark.scheduler.cluster.YarnClusterSchedulerBackend")
+ val cons = clazz.getConstructor(classOf[TaskSchedulerImpl], classOf[SparkContext])
+ cons.newInstance(scheduler, sc).asInstanceOf[CoarseGrainedSchedulerBackend]
+ } catch {
+ case e: Exception => {
+ throw new SparkException("YARN mode not available ?", e)
+ }
+ }
scheduler.initialize(backend)
scheduler
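The regex change above also lets the failure-tolerant local mode use all cores, i.e. master strings of the form `local[*, M]`. A minimal sketch; the app name and job are placeholders:

```scala
// Sketch: run locally with one thread per core, allowing up to 4 task failures,
// which the updated LOCAL_N_FAILURES_REGEX now accepts.
import org.apache.spark.{SparkConf, SparkContext}

object LocalMasterSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("local-master-sketch").setMaster("local[*, 4]")
    val sc = new SparkContext(conf)
    try {
      println(sc.parallelize(1 to 100).count())  // 100
    } finally {
      sc.stop()
    }
  }
}
```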
diff --git a/core/src/main/scala/org/apache/spark/SparkEnv.scala b/core/src/main/scala/org/apache/spark/SparkEnv.scala
index 2b636b085d73a..0bce531aaba3e 100644
--- a/core/src/main/scala/org/apache/spark/SparkEnv.scala
+++ b/core/src/main/scala/org/apache/spark/SparkEnv.scala
@@ -18,6 +18,7 @@
package org.apache.spark
import java.io.File
+import java.net.Socket
import scala.collection.JavaConversions._
import scala.collection.mutable
@@ -67,7 +68,7 @@ class SparkEnv (
val metricsSystem: MetricsSystem,
val conf: SparkConf) extends Logging {
- // A mapping of thread ID to amount of memory used for shuffle in bytes
+ // A mapping of thread ID to amount of memory, in bytes, used for shuffle aggregations
// All accesses should be manually synchronized
val shuffleMemoryMap = mutable.HashMap[Long, Long]()
@@ -79,7 +80,7 @@ class SparkEnv (
private[spark] def stop() {
pythonWorkers.foreach { case(key, worker) => worker.stop() }
- httpFileServer.stop()
+ Option(httpFileServer).foreach(_.stop())
mapOutputTracker.stop()
shuffleManager.stop()
broadcastManager.stop()
@@ -102,10 +103,10 @@ class SparkEnv (
}
private[spark]
- def destroyPythonWorker(pythonExec: String, envVars: Map[String, String]) {
+ def destroyPythonWorker(pythonExec: String, envVars: Map[String, String], worker: Socket) {
synchronized {
val key = (pythonExec, envVars)
- pythonWorkers(key).stop()
+ pythonWorkers.get(key).foreach(_.stopWorker(worker))
}
}
}
@@ -193,13 +194,7 @@ object SparkEnv extends Logging {
logInfo("Registering " + name)
actorSystem.actorOf(Props(newActor), name = name)
} else {
- val driverHost: String = conf.get("spark.driver.host", "localhost")
- val driverPort: Int = conf.getInt("spark.driver.port", 7077)
- Utils.checkHost(driverHost, "Expected hostname")
- val url = s"akka.tcp://spark@$driverHost:$driverPort/user/$name"
- val timeout = AkkaUtils.lookupTimeout(conf)
- logInfo(s"Connecting to $name: $url")
- Await.result(actorSystem.actorSelection(url).resolveOne(timeout), timeout)
+ AkkaUtils.makeDriverRef(name, conf, actorSystem)
}
}
@@ -228,9 +223,15 @@ object SparkEnv extends Logging {
val cacheManager = new CacheManager(blockManager)
- val httpFileServer = new HttpFileServer(securityManager)
- httpFileServer.initialize()
- conf.set("spark.fileserver.uri", httpFileServer.serverUri)
+ val httpFileServer =
+ if (isDriver) {
+ val server = new HttpFileServer(securityManager)
+ server.initialize()
+ conf.set("spark.fileserver.uri", server.serverUri)
+ server
+ } else {
+ null
+ }
val metricsSystem = if (isDriver) {
MetricsSystem.createMetricsSystem("driver", conf, securityManager)
diff --git a/core/src/main/scala/org/apache/spark/TaskEndReason.scala b/core/src/main/scala/org/apache/spark/TaskEndReason.scala
index df42d679b4699..8f0c5e78416c2 100644
--- a/core/src/main/scala/org/apache/spark/TaskEndReason.scala
+++ b/core/src/main/scala/org/apache/spark/TaskEndReason.scala
@@ -20,6 +20,7 @@ package org.apache.spark
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.executor.TaskMetrics
import org.apache.spark.storage.BlockManagerId
+import org.apache.spark.util.Utils
/**
* :: DeveloperApi ::
@@ -88,10 +89,7 @@ case class ExceptionFailure(
stackTrace: Array[StackTraceElement],
metrics: Option[TaskMetrics])
extends TaskFailedReason {
- override def toErrorString: String = {
- val stackTraceString = if (stackTrace == null) "null" else stackTrace.mkString("\n")
- s"$className ($description}\n$stackTraceString"
- }
+ override def toErrorString: String = Utils.exceptionString(className, description, stackTrace)
}
/**
diff --git a/core/src/main/scala/org/apache/spark/TestUtils.scala b/core/src/main/scala/org/apache/spark/TestUtils.scala
index 885c6829a2d72..8ca731038e528 100644
--- a/core/src/main/scala/org/apache/spark/TestUtils.scala
+++ b/core/src/main/scala/org/apache/spark/TestUtils.scala
@@ -92,8 +92,8 @@ private[spark] object TestUtils {
def createCompiledClass(className: String, destDir: File, value: String = ""): File = {
val compiler = ToolProvider.getSystemJavaCompiler
val sourceFile = new JavaSourceFromString(className,
- "public class " + className + " { @Override public String toString() { " +
- "return \"" + value + "\";}}")
+ "public class " + className + " implements java.io.Serializable {" +
+ " @Override public String toString() { return \"" + value + "\"; }}")
// Calling this outputs a class file in pwd. It's easier to just rename the file than
// build a custom FileManager that controls the output location.
diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaHadoopRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaHadoopRDD.scala
new file mode 100644
index 0000000000000..0ae0b4ec042e2
--- /dev/null
+++ b/core/src/main/scala/org/apache/spark/api/java/JavaHadoopRDD.scala
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.api.java
+
+import scala.collection.JavaConversions._
+import scala.reflect.ClassTag
+
+import org.apache.hadoop.mapred.InputSplit
+
+import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.api.java.JavaSparkContext._
+import org.apache.spark.api.java.function.{Function2 => JFunction2}
+import org.apache.spark.rdd.HadoopRDD
+
+@DeveloperApi
+class JavaHadoopRDD[K, V](rdd: HadoopRDD[K, V])
+ (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
+ extends JavaPairRDD[K, V](rdd) {
+
+ /** Maps over a partition, providing the InputSplit that was used as the base of the partition. */
+ @DeveloperApi
+ def mapPartitionsWithInputSplit[R](
+ f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
+ preservesPartitioning: Boolean = false): JavaRDD[R] = {
+ new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, asJavaIterator(b)),
+ preservesPartitioning)(fakeClassTag))(fakeClassTag)
+ }
+}
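A hedged usage sketch of the new API from Scala: `JavaSparkContext.hadoopFile` now returns a `JavaHadoopRDD` at runtime (its declared type is still `JavaPairRDD`, hence the cast), and `mapPartitionsWithInputSplit` exposes the split, for example to tag records with their source file. The path and record handling are placeholders:

```scala
// Sketch of mapPartitionsWithInputSplit; input path and processing are made up.
import java.util.{Iterator => JIterator}
import scala.collection.JavaConversions._
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.{FileSplit, InputSplit, TextInputFormat}
import org.apache.spark.SparkConf
import org.apache.spark.api.java.{JavaHadoopRDD, JavaSparkContext}
import org.apache.spark.api.java.function.{Function2 => JFunction2}

object InputSplitSketch {
  def main(args: Array[String]): Unit = {
    val jsc = new JavaSparkContext(new SparkConf().setMaster("local[2]").setAppName("splits"))
    val rdd = jsc.hadoopFile("/tmp/input", classOf[TextInputFormat],
      classOf[LongWritable], classOf[Text]).asInstanceOf[JavaHadoopRDD[LongWritable, Text]]

    // Tag every line with the path of the file split it came from.
    val tagged = rdd.mapPartitionsWithInputSplit(
      new JFunction2[InputSplit, JIterator[(LongWritable, Text)], JIterator[String]] {
        override def call(split: InputSplit,
                          it: JIterator[(LongWritable, Text)]): JIterator[String] = {
          val path = split.asInstanceOf[FileSplit].getPath.toString
          asJavaIterator(it.map(kv => s"$path: ${kv._2}"))
        }
      })
    tagged.collect().foreach(println)
    jsc.stop()
  }
}
```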
diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaNewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaNewHadoopRDD.scala
new file mode 100644
index 0000000000000..ec4f3964d75e0
--- /dev/null
+++ b/core/src/main/scala/org/apache/spark/api/java/JavaNewHadoopRDD.scala
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.api.java
+
+import scala.collection.JavaConversions._
+import scala.reflect.ClassTag
+
+import org.apache.hadoop.mapreduce.InputSplit
+
+import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.api.java.JavaSparkContext._
+import org.apache.spark.api.java.function.{Function2 => JFunction2}
+import org.apache.spark.rdd.NewHadoopRDD
+
+@DeveloperApi
+class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V])
+ (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
+ extends JavaPairRDD[K, V](rdd) {
+
+ /** Maps over a partition, providing the InputSplit that was used as the base of the partition. */
+ @DeveloperApi
+ def mapPartitionsWithInputSplit[R](
+ f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
+ preservesPartitioning: Boolean = false): JavaRDD[R] = {
+ new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, asJavaIterator(b)),
+ preservesPartitioning)(fakeClassTag))(fakeClassTag)
+ }
+}
diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala
index 4f3081433a542..76d4193e96aea 100644
--- a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala
@@ -17,7 +17,7 @@
package org.apache.spark.api.java
-import java.util.{Comparator, List => JList}
+import java.util.{Comparator, List => JList, Map => JMap}
import java.lang.{Iterable => JIterable}
import scala.collection.JavaConversions._
@@ -122,13 +122,80 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])
*/
def sample(withReplacement: Boolean, fraction: Double): JavaPairRDD[K, V] =
sample(withReplacement, fraction, Utils.random.nextLong)
-
+
/**
* Return a sampled subset of this RDD.
*/
def sample(withReplacement: Boolean, fraction: Double, seed: Long): JavaPairRDD[K, V] =
new JavaPairRDD[K, V](rdd.sample(withReplacement, fraction, seed))
+ /**
+ * Return a subset of this RDD sampled by key (via stratified sampling).
+ *
+ * Create a sample of this RDD using variable sampling rates for different keys as specified by
+ * `fractions`, a key to sampling rate map.
+ *
+ * If `exact` is set to false, create the sample via simple random sampling, with one pass
+ * over the RDD, to produce a sample of size that's approximately equal to the sum of
+ * math.ceil(numItems * samplingRate) over all key values; otherwise, use additional passes over
+ * the RDD to create a sample size that's exactly equal to the sum of
+ * math.ceil(numItems * samplingRate) over all key values.
+ */
+ def sampleByKey(withReplacement: Boolean,
+ fractions: JMap[K, Double],
+ exact: Boolean,
+ seed: Long): JavaPairRDD[K, V] =
+ new JavaPairRDD[K, V](rdd.sampleByKey(withReplacement, fractions, exact, seed))
+
+ /**
+ * Return a subset of this RDD sampled by key (via stratified sampling).
+ *
+ * Create a sample of this RDD using variable sampling rates for different keys as specified by
+ * `fractions`, a key to sampling rate map.
+ *
+ * If `exact` is set to false, create the sample via simple random sampling, with one pass
+ * over the RDD, to produce a sample of size that's approximately equal to the sum of
+ * math.ceil(numItems * samplingRate) over all key values; otherwise, use additional passes over
+ * the RDD to create a sample size that's exactly equal to the sum of
+ * math.ceil(numItems * samplingRate) over all key values.
+ *
+ * Use Utils.random.nextLong as the default seed for the random number generator
+ */
+ def sampleByKey(withReplacement: Boolean,
+ fractions: JMap[K, Double],
+ exact: Boolean): JavaPairRDD[K, V] =
+ sampleByKey(withReplacement, fractions, exact, Utils.random.nextLong)
+
+ /**
+ * Return a subset of this RDD sampled by key (via stratified sampling).
+ *
+ * Create a sample of this RDD using variable sampling rates for different keys as specified by
+ * `fractions`, a key to sampling rate map.
+ *
+ * Produce a sample of size that's approximately equal to the sum of
+ * math.ceil(numItems * samplingRate) over all key values with one pass over the RDD via
+ * simple random sampling.
+ */
+ def sampleByKey(withReplacement: Boolean,
+ fractions: JMap[K, Double],
+ seed: Long): JavaPairRDD[K, V] =
+ sampleByKey(withReplacement, fractions, false, seed)
+
+ /**
+ * Return a subset of this RDD sampled by key (via stratified sampling).
+ *
+ * Create a sample of this RDD using variable sampling rates for different keys as specified by
+ * `fractions`, a key to sampling rate map.
+ *
+ * Produce a sample of size that's approximately equal to the sum of
+ * math.ceil(numItems * samplingRate) over all key values with one pass over the RDD via
+ * simple random sampling.
+ *
+ * Use Utils.random.nextLong as the default seed for the random number generator
+ */
+ def sampleByKey(withReplacement: Boolean, fractions: JMap[K, Double]): JavaPairRDD[K, V] =
+ sampleByKey(withReplacement, fractions, false, Utils.random.nextLong)
+
/**
* Return the union of this RDD and another one. Any identical elements will appear multiple
* times (use `.distinct()` to eliminate them).
@@ -716,6 +783,17 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])
sortByKey(comp, ascending)
}
+ /**
+ * Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling
+ * `collect` or `save` on the resulting RDD will return or output an ordered list of records
+ * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in
+ * order of the keys).
+ */
+ def sortByKey(ascending: Boolean, numPartitions: Int): JavaPairRDD[K, V] = {
+ val comp = com.google.common.collect.Ordering.natural().asInstanceOf[Comparator[K]]
+ sortByKey(comp, ascending, numPartitions)
+ }
+
/**
* Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling
* `collect` or `save` on the resulting RDD will return or output an ordered list of records
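A sketch of calling the new `sampleByKey` overloads from Scala against the Java API; the data, strata and sampling fractions are made up:

```scala
// Sketch of the new JavaPairRDD.sampleByKey overloads (approximate sample here).
import scala.collection.JavaConversions._
import org.apache.spark.SparkConf
import org.apache.spark.api.java.JavaSparkContext

object SampleByKeySketch {
  def main(args: Array[String]): Unit = {
    val jsc = new JavaSparkContext(
      new SparkConf().setMaster("local[2]").setAppName("sampleByKey"))
    val pairs = jsc.parallelizePairs(seqAsJavaList(
      (1 to 1000).map(i => (if (i % 2 == 0) "even" else "odd", i))))

    // Keep roughly 10% of the "even" stratum and 50% of the "odd" stratum in a
    // single pass, using the (withReplacement, fractions, seed) overload.
    val fractions = mapAsJavaMap(Map("even" -> 0.1, "odd" -> 0.5))
    val sample = pairs.sampleByKey(false, fractions, 42L)
    println(sample.countByKey())
    jsc.stop()
  }
}
```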
diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala
index 1c83a611ea76a..e0a4815940db3 100644
--- a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala
@@ -34,7 +34,7 @@ import org.apache.spark._
import org.apache.spark.SparkContext.{DoubleAccumulatorParam, IntAccumulatorParam}
import org.apache.spark.api.java.JavaSparkContext.fakeClassTag
import org.apache.spark.broadcast.Broadcast
-import org.apache.spark.rdd.RDD
+import org.apache.spark.rdd.{EmptyRDD, HadoopRDD, NewHadoopRDD, RDD}
/**
* A Java-friendly version of [[org.apache.spark.SparkContext]] that returns
@@ -112,6 +112,9 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork
def startTime: java.lang.Long = sc.startTime
+ /** The version of Spark on which this application is running. */
+ def version: String = sc.version
+
/** Default level of parallelism to use when not given by user (e.g. parallelize and makeRDD). */
def defaultParallelism: java.lang.Integer = sc.defaultParallelism
@@ -132,6 +135,13 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork
sc.parallelize(JavaConversions.asScalaBuffer(list), numSlices)
}
+ /** Get an RDD that has no partitions or elements. */
+ def emptyRDD[T]: JavaRDD[T] = {
+ implicit val ctag: ClassTag[T] = fakeClassTag
+ JavaRDD.fromRDD(new EmptyRDD[T](sc))
+ }
+
+
/** Distribute a local Scala collection to form an RDD. */
def parallelize[T](list: java.util.List[T]): JavaRDD[T] =
parallelize(list, sc.defaultParallelism)
@@ -284,7 +294,8 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork
): JavaPairRDD[K, V] = {
implicit val ctagK: ClassTag[K] = ClassTag(keyClass)
implicit val ctagV: ClassTag[V] = ClassTag(valueClass)
- new JavaPairRDD(sc.hadoopRDD(conf, inputFormatClass, keyClass, valueClass, minPartitions))
+ val rdd = sc.hadoopRDD(conf, inputFormatClass, keyClass, valueClass, minPartitions)
+ new JavaHadoopRDD(rdd.asInstanceOf[HadoopRDD[K, V]])
}
/**
@@ -304,7 +315,8 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork
): JavaPairRDD[K, V] = {
implicit val ctagK: ClassTag[K] = ClassTag(keyClass)
implicit val ctagV: ClassTag[V] = ClassTag(valueClass)
- new JavaPairRDD(sc.hadoopRDD(conf, inputFormatClass, keyClass, valueClass))
+ val rdd = sc.hadoopRDD(conf, inputFormatClass, keyClass, valueClass)
+ new JavaHadoopRDD(rdd.asInstanceOf[HadoopRDD[K, V]])
}
/** Get an RDD for a Hadoop file with an arbitrary InputFormat.
@@ -323,7 +335,8 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork
): JavaPairRDD[K, V] = {
implicit val ctagK: ClassTag[K] = ClassTag(keyClass)
implicit val ctagV: ClassTag[V] = ClassTag(valueClass)
- new JavaPairRDD(sc.hadoopFile(path, inputFormatClass, keyClass, valueClass, minPartitions))
+ val rdd = sc.hadoopFile(path, inputFormatClass, keyClass, valueClass, minPartitions)
+ new JavaHadoopRDD(rdd.asInstanceOf[HadoopRDD[K, V]])
}
/** Get an RDD for a Hadoop file with an arbitrary InputFormat
@@ -341,8 +354,8 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork
): JavaPairRDD[K, V] = {
implicit val ctagK: ClassTag[K] = ClassTag(keyClass)
implicit val ctagV: ClassTag[V] = ClassTag(valueClass)
- new JavaPairRDD(sc.hadoopFile(path,
- inputFormatClass, keyClass, valueClass))
+ val rdd = sc.hadoopFile(path, inputFormatClass, keyClass, valueClass)
+ new JavaHadoopRDD(rdd.asInstanceOf[HadoopRDD[K, V]])
}
/**
@@ -362,7 +375,8 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork
conf: Configuration): JavaPairRDD[K, V] = {
implicit val ctagK: ClassTag[K] = ClassTag(kClass)
implicit val ctagV: ClassTag[V] = ClassTag(vClass)
- new JavaPairRDD(sc.newAPIHadoopFile(path, fClass, kClass, vClass, conf))
+ val rdd = sc.newAPIHadoopFile(path, fClass, kClass, vClass, conf)
+ new JavaNewHadoopRDD(rdd.asInstanceOf[NewHadoopRDD[K, V]])
}
/**
@@ -381,7 +395,8 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork
vClass: Class[V]): JavaPairRDD[K, V] = {
implicit val ctagK: ClassTag[K] = ClassTag(kClass)
implicit val ctagV: ClassTag[V] = ClassTag(vClass)
- new JavaPairRDD(sc.newAPIHadoopRDD(conf, fClass, kClass, vClass))
+ val rdd = sc.newAPIHadoopRDD(conf, fClass, kClass, vClass)
+ new JavaNewHadoopRDD(rdd.asInstanceOf[NewHadoopRDD[K, V]])
}
/** Build the union of two or more RDDs. */
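A small sketch exercising the new `version` and `emptyRDD` additions; the union is only there to show the empty RDD composes like any other:

```scala
// Sketch of JavaSparkContext.version and emptyRDD.
import java.util.Arrays
import org.apache.spark.SparkConf
import org.apache.spark.api.java.{JavaRDD, JavaSparkContext}

object EmptyRddSketch {
  def main(args: Array[String]): Unit = {
    val jsc = new JavaSparkContext(
      new SparkConf().setMaster("local[2]").setAppName("empty-rdd"))
    println(s"Running on Spark ${jsc.version}")

    val empty: JavaRDD[Integer] = jsc.emptyRDD[Integer]
    val data = jsc.parallelize(
      Arrays.asList(Integer.valueOf(1), Integer.valueOf(2), Integer.valueOf(3)))
    println(empty.count())             // 0
    println(data.union(empty).count()) // 3: union with an empty RDD is a no-op
    jsc.stop()
  }
}
```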
diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonHadoopUtil.scala b/core/src/main/scala/org/apache/spark/api/python/PythonHadoopUtil.scala
index adaa1ef6cf9ff..f3b05e1243045 100644
--- a/core/src/main/scala/org/apache/spark/api/python/PythonHadoopUtil.scala
+++ b/core/src/main/scala/org/apache/spark/api/python/PythonHadoopUtil.scala
@@ -17,8 +17,9 @@
package org.apache.spark.api.python
+import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
-import org.apache.spark.Logging
+import org.apache.spark.{Logging, SerializableWritable, SparkException}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io._
import scala.util.{Failure, Success, Try}
@@ -31,13 +32,14 @@ import org.apache.spark.annotation.Experimental
* transformation code by overriding the convert method.
*/
@Experimental
-trait Converter[T, U] extends Serializable {
+trait Converter[T, + U] extends Serializable {
def convert(obj: T): U
}
private[python] object Converter extends Logging {
- def getInstance(converterClass: Option[String]): Converter[Any, Any] = {
+ def getInstance(converterClass: Option[String],
+ defaultConverter: Converter[Any, Any]): Converter[Any, Any] = {
converterClass.map { cc =>
Try {
val c = Class.forName(cc).newInstance().asInstanceOf[Converter[Any, Any]]
@@ -49,7 +51,7 @@ private[python] object Converter extends Logging {
logError(s"Failed to load converter: $cc")
throw err
}
- }.getOrElse { new DefaultConverter }
+ }.getOrElse { defaultConverter }
}
}
@@ -57,7 +59,9 @@ private[python] object Converter extends Logging {
* A converter that handles conversion of common [[org.apache.hadoop.io.Writable]] objects.
* Other objects are passed through without conversion.
*/
-private[python] class DefaultConverter extends Converter[Any, Any] {
+private[python] class WritableToJavaConverter(
+ conf: Broadcast[SerializableWritable[Configuration]],
+ batchSize: Int) extends Converter[Any, Any] {
/**
* Converts a [[org.apache.hadoop.io.Writable]] to the underlying primitive, String or
@@ -72,17 +76,30 @@ private[python] class DefaultConverter extends Converter[Any, Any] {
case fw: FloatWritable => fw.get()
case t: Text => t.toString
case bw: BooleanWritable => bw.get()
- case byw: BytesWritable => byw.getBytes
+ case byw: BytesWritable =>
+ val bytes = new Array[Byte](byw.getLength)
+ System.arraycopy(byw.getBytes(), 0, bytes, 0, byw.getLength)
+ bytes
case n: NullWritable => null
- case aw: ArrayWritable => aw.get().map(convertWritable(_))
- case mw: MapWritable => mapAsJavaMap(mw.map { case (k, v) =>
- (convertWritable(k), convertWritable(v))
- }.toMap)
+ case aw: ArrayWritable =>
+ // Due to erasure, all arrays appear as Object[] and they get pickled to Python tuples.
+ // Since we can't determine element types for empty arrays, we will not attempt to
+ // convert to primitive arrays (which get pickled to Python arrays). Users may want
+ // to write custom converters for arrays if they know the element types a priori.
+ aw.get().map(convertWritable(_))
+ case mw: MapWritable =>
+ val map = new java.util.HashMap[Any, Any]()
+ mw.foreach { case (k, v) =>
+ map.put(convertWritable(k), convertWritable(v))
+ }
+ map
+ case w: Writable =>
+ if (batchSize > 1) WritableUtils.clone(w, conf.value.value) else w
case other => other
}
}
- def convert(obj: Any): Any = {
+ override def convert(obj: Any): Any = {
obj match {
case writable: Writable =>
convertWritable(writable)
@@ -92,6 +109,47 @@ private[python] class DefaultConverter extends Converter[Any, Any] {
}
}
+/**
+ * A converter that converts common types to [[org.apache.hadoop.io.Writable]]. Note that array
+ * types are not supported since the user needs to subclass [[org.apache.hadoop.io.ArrayWritable]]
+ * to set the type properly. See [[org.apache.spark.api.python.DoubleArrayWritable]] and
+ * [[org.apache.spark.api.python.DoubleArrayToWritableConverter]] for an example. They are used in
+ * PySpark RDD `saveAsNewAPIHadoopFile` doctest.
+ */
+private[python] class JavaToWritableConverter extends Converter[Any, Writable] {
+
+ /**
+ * Converts common data types to [[org.apache.hadoop.io.Writable]]. Note that array types are not
+ * supported out-of-the-box.
+ */
+ private def convertToWritable(obj: Any): Writable = {
+ import collection.JavaConversions._
+ obj match {
+ case i: java.lang.Integer => new IntWritable(i)
+ case d: java.lang.Double => new DoubleWritable(d)
+ case l: java.lang.Long => new LongWritable(l)
+ case f: java.lang.Float => new FloatWritable(f)
+ case s: java.lang.String => new Text(s)
+ case b: java.lang.Boolean => new BooleanWritable(b)
+ case aob: Array[Byte] => new BytesWritable(aob)
+ case null => NullWritable.get()
+ case map: java.util.Map[_, _] =>
+ val mapWritable = new MapWritable()
+ map.foreach { case (k, v) =>
+ mapWritable.put(convertToWritable(k), convertToWritable(v))
+ }
+ mapWritable
+ case other => throw new SparkException(
+ s"Data of type ${other.getClass.getName} cannot be used")
+ }
+ }
+
+ override def convert(obj: Any): Writable = obj match {
+ case writable: Writable => writable
+ case other => convertToWritable(other)
+ }
+}
+
/** Utilities for working with Python objects <-> Hadoop-related objects */
private[python] object PythonHadoopUtil {
@@ -118,7 +176,7 @@ private[python] object PythonHadoopUtil {
/**
* Converts an RDD of key-value pairs, where key and/or value could be instances of
- * [[org.apache.hadoop.io.Writable]], into an RDD[(K, V)]
+ * [[org.apache.hadoop.io.Writable]], into an RDD of base types, or vice versa.
*/
def convertRDD[K, V](rdd: RDD[(K, V)],
keyConverter: Converter[Any, Any],
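The covariant type parameter (`+U`) and the new `defaultConverter` fallback above are easiest to see in a small, self-contained sketch. The converter class below is hypothetical (only the public `Converter` trait and Hadoop's `Text` are taken from this patch); user code would register such a class by name, while `Converter.getInstance` itself stays private[python].

import org.apache.hadoop.io.Text

import org.apache.spark.api.python.Converter

// Hypothetical user-defined converter: wraps any key or value in a Hadoop Text.
class AnyToTextConverter extends Converter[Any, Text] {
  override def convert(obj: Any): Text = new Text(String.valueOf(obj))
}

object ConverterVarianceSketch {
  def main(args: Array[String]): Unit = {
    // The covariant +U is what lets a Converter[Any, Text] stand in for the
    // Converter[Any, Any] expected by getInstance/convertRDD -- the same reason
    // JavaToWritableConverter (a Converter[Any, Writable]) can be the defaultConverter.
    val asAny: Converter[Any, Any] = new AnyToTextConverter
    println(asAny.convert(42)) // prints 42 as a Text
  }
}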
diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
index 462e09466bfa6..0b5322c6fb965 100644
--- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
@@ -23,22 +23,25 @@ import java.nio.charset.Charset
import java.util.{List => JList, ArrayList => JArrayList, Map => JMap, Collections}
import scala.collection.JavaConversions._
+import scala.language.existentials
import scala.reflect.ClassTag
-import scala.util.Try
+import scala.util.{Try, Success, Failure}
import net.razorvine.pickle.{Pickler, Unpickler}
import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.mapred.{InputFormat, JobConf}
-import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat}
+import org.apache.hadoop.io.compress.CompressionCodec
+import org.apache.hadoop.mapred.{InputFormat, OutputFormat, JobConf}
+import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, OutputFormat => NewOutputFormat}
import org.apache.spark._
+import org.apache.spark.SparkContext._
import org.apache.spark.api.java.{JavaSparkContext, JavaPairRDD, JavaRDD}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.util.Utils
-private[spark] class PythonRDD[T: ClassTag](
- parent: RDD[T],
+private[spark] class PythonRDD(
+ parent: RDD[_],
command: Array[Byte],
envVars: JMap[String, String],
pythonIncludes: JList[String],
@@ -57,6 +60,9 @@ private[spark] class PythonRDD[T: ClassTag](
override def compute(split: Partition, context: TaskContext): Iterator[Array[Byte]] = {
val startTime = System.currentTimeMillis
val env = SparkEnv.get
+ val localdir = env.blockManager.diskBlockManager.localDirs.map(
+ f => f.getPath()).mkString(",")
+ envVars += ("SPARK_LOCAL_DIR" -> localdir) // it's also used in monitor thread
val worker: Socket = env.createPythonWorker(pythonExec, envVars.toMap)
// Start a thread to feed the process input from our parent's iterator
@@ -235,7 +241,7 @@ private[spark] class PythonRDD[T: ClassTag](
if (!context.completed) {
try {
logWarning("Incomplete task interrupted: Attempting to kill Python Worker")
- env.destroyPythonWorker(pythonExec, envVars.toMap)
+ env.destroyPythonWorker(pythonExec, envVars.toMap, worker)
} catch {
case e: Exception =>
logError("Exception when trying to kill worker", e)
@@ -362,19 +368,17 @@ private[spark] object PythonRDD extends Logging {
valueClassMaybeNull: String,
keyConverterClass: String,
valueConverterClass: String,
- minSplits: Int) = {
+ minSplits: Int,
+ batchSize: Int) = {
val keyClass = Option(keyClassMaybeNull).getOrElse("org.apache.hadoop.io.Text")
val valueClass = Option(valueClassMaybeNull).getOrElse("org.apache.hadoop.io.Text")
- implicit val kcm = ClassTag(Class.forName(keyClass)).asInstanceOf[ClassTag[K]]
- implicit val vcm = ClassTag(Class.forName(valueClass)).asInstanceOf[ClassTag[V]]
- val kc = kcm.runtimeClass.asInstanceOf[Class[K]]
- val vc = vcm.runtimeClass.asInstanceOf[Class[V]]
-
+ val kc = Class.forName(keyClass).asInstanceOf[Class[K]]
+ val vc = Class.forName(valueClass).asInstanceOf[Class[V]]
val rdd = sc.sc.sequenceFile[K, V](path, kc, vc, minSplits)
- val keyConverter = Converter.getInstance(Option(keyConverterClass))
- val valueConverter = Converter.getInstance(Option(valueConverterClass))
- val converted = PythonHadoopUtil.convertRDD[K, V](rdd, keyConverter, valueConverter)
- JavaRDD.fromRDD(SerDeUtil.rddToPython(converted))
+ val confBroadcasted = sc.sc.broadcast(new SerializableWritable(sc.hadoopConfiguration()))
+ val converted = convertRDD(rdd, keyConverterClass, valueConverterClass,
+ new WritableToJavaConverter(confBroadcasted, batchSize))
+ JavaRDD.fromRDD(SerDeUtil.pairRDDToPython(converted, batchSize))
}
/**
@@ -391,17 +395,16 @@ private[spark] object PythonRDD extends Logging {
valueClass: String,
keyConverterClass: String,
valueConverterClass: String,
- confAsMap: java.util.HashMap[String, String]) = {
- val conf = PythonHadoopUtil.mapToConf(confAsMap)
- val baseConf = sc.hadoopConfiguration()
- val mergedConf = PythonHadoopUtil.mergeConfs(baseConf, conf)
+ confAsMap: java.util.HashMap[String, String],
+ batchSize: Int) = {
+ val mergedConf = getMergedConf(confAsMap, sc.hadoopConfiguration())
val rdd =
newAPIHadoopRDDFromClassNames[K, V, F](sc,
Some(path), inputFormatClass, keyClass, valueClass, mergedConf)
- val keyConverter = Converter.getInstance(Option(keyConverterClass))
- val valueConverter = Converter.getInstance(Option(valueConverterClass))
- val converted = PythonHadoopUtil.convertRDD[K, V](rdd, keyConverter, valueConverter)
- JavaRDD.fromRDD(SerDeUtil.rddToPython(converted))
+ val confBroadcasted = sc.sc.broadcast(new SerializableWritable(mergedConf))
+ val converted = convertRDD(rdd, keyConverterClass, valueConverterClass,
+ new WritableToJavaConverter(confBroadcasted, batchSize))
+ JavaRDD.fromRDD(SerDeUtil.pairRDDToPython(converted, batchSize))
}
/**
@@ -418,15 +421,16 @@ private[spark] object PythonRDD extends Logging {
valueClass: String,
keyConverterClass: String,
valueConverterClass: String,
- confAsMap: java.util.HashMap[String, String]) = {
+ confAsMap: java.util.HashMap[String, String],
+ batchSize: Int) = {
val conf = PythonHadoopUtil.mapToConf(confAsMap)
val rdd =
newAPIHadoopRDDFromClassNames[K, V, F](sc,
None, inputFormatClass, keyClass, valueClass, conf)
- val keyConverter = Converter.getInstance(Option(keyConverterClass))
- val valueConverter = Converter.getInstance(Option(valueConverterClass))
- val converted = PythonHadoopUtil.convertRDD[K, V](rdd, keyConverter, valueConverter)
- JavaRDD.fromRDD(SerDeUtil.rddToPython(converted))
+ val confBroadcasted = sc.sc.broadcast(new SerializableWritable(conf))
+ val converted = convertRDD(rdd, keyConverterClass, valueConverterClass,
+ new WritableToJavaConverter(confBroadcasted, batchSize))
+ JavaRDD.fromRDD(SerDeUtil.pairRDDToPython(converted, batchSize))
}
private def newAPIHadoopRDDFromClassNames[K, V, F <: NewInputFormat[K, V]](
@@ -436,18 +440,14 @@ private[spark] object PythonRDD extends Logging {
keyClass: String,
valueClass: String,
conf: Configuration) = {
- implicit val kcm = ClassTag(Class.forName(keyClass)).asInstanceOf[ClassTag[K]]
- implicit val vcm = ClassTag(Class.forName(valueClass)).asInstanceOf[ClassTag[V]]
- implicit val fcm = ClassTag(Class.forName(inputFormatClass)).asInstanceOf[ClassTag[F]]
- val kc = kcm.runtimeClass.asInstanceOf[Class[K]]
- val vc = vcm.runtimeClass.asInstanceOf[Class[V]]
- val fc = fcm.runtimeClass.asInstanceOf[Class[F]]
- val rdd = if (path.isDefined) {
+ val kc = Class.forName(keyClass).asInstanceOf[Class[K]]
+ val vc = Class.forName(valueClass).asInstanceOf[Class[V]]
+ val fc = Class.forName(inputFormatClass).asInstanceOf[Class[F]]
+ if (path.isDefined) {
sc.sc.newAPIHadoopFile[K, V, F](path.get, fc, kc, vc, conf)
} else {
sc.sc.newAPIHadoopRDD[K, V, F](conf, fc, kc, vc)
}
- rdd
}
/**
@@ -464,17 +464,16 @@ private[spark] object PythonRDD extends Logging {
valueClass: String,
keyConverterClass: String,
valueConverterClass: String,
- confAsMap: java.util.HashMap[String, String]) = {
- val conf = PythonHadoopUtil.mapToConf(confAsMap)
- val baseConf = sc.hadoopConfiguration()
- val mergedConf = PythonHadoopUtil.mergeConfs(baseConf, conf)
+ confAsMap: java.util.HashMap[String, String],
+ batchSize: Int) = {
+ val mergedConf = getMergedConf(confAsMap, sc.hadoopConfiguration())
val rdd =
hadoopRDDFromClassNames[K, V, F](sc,
Some(path), inputFormatClass, keyClass, valueClass, mergedConf)
- val keyConverter = Converter.getInstance(Option(keyConverterClass))
- val valueConverter = Converter.getInstance(Option(valueConverterClass))
- val converted = PythonHadoopUtil.convertRDD[K, V](rdd, keyConverter, valueConverter)
- JavaRDD.fromRDD(SerDeUtil.rddToPython(converted))
+ val confBroadcasted = sc.sc.broadcast(new SerializableWritable(mergedConf))
+ val converted = convertRDD(rdd, keyConverterClass, valueConverterClass,
+ new WritableToJavaConverter(confBroadcasted, batchSize))
+ JavaRDD.fromRDD(SerDeUtil.pairRDDToPython(converted, batchSize))
}
/**
@@ -491,15 +490,16 @@ private[spark] object PythonRDD extends Logging {
valueClass: String,
keyConverterClass: String,
valueConverterClass: String,
- confAsMap: java.util.HashMap[String, String]) = {
+ confAsMap: java.util.HashMap[String, String],
+ batchSize: Int) = {
val conf = PythonHadoopUtil.mapToConf(confAsMap)
val rdd =
hadoopRDDFromClassNames[K, V, F](sc,
None, inputFormatClass, keyClass, valueClass, conf)
- val keyConverter = Converter.getInstance(Option(keyConverterClass))
- val valueConverter = Converter.getInstance(Option(valueConverterClass))
- val converted = PythonHadoopUtil.convertRDD[K, V](rdd, keyConverter, valueConverter)
- JavaRDD.fromRDD(SerDeUtil.rddToPython(converted))
+ val confBroadcasted = sc.sc.broadcast(new SerializableWritable(conf))
+ val converted = convertRDD(rdd, keyConverterClass, valueConverterClass,
+ new WritableToJavaConverter(confBroadcasted, batchSize))
+ JavaRDD.fromRDD(SerDeUtil.pairRDDToPython(converted, batchSize))
}
private def hadoopRDDFromClassNames[K, V, F <: InputFormat[K, V]](
@@ -509,18 +509,14 @@ private[spark] object PythonRDD extends Logging {
keyClass: String,
valueClass: String,
conf: Configuration) = {
- implicit val kcm = ClassTag(Class.forName(keyClass)).asInstanceOf[ClassTag[K]]
- implicit val vcm = ClassTag(Class.forName(valueClass)).asInstanceOf[ClassTag[V]]
- implicit val fcm = ClassTag(Class.forName(inputFormatClass)).asInstanceOf[ClassTag[F]]
- val kc = kcm.runtimeClass.asInstanceOf[Class[K]]
- val vc = vcm.runtimeClass.asInstanceOf[Class[V]]
- val fc = fcm.runtimeClass.asInstanceOf[Class[F]]
- val rdd = if (path.isDefined) {
+ val kc = Class.forName(keyClass).asInstanceOf[Class[K]]
+ val vc = Class.forName(valueClass).asInstanceOf[Class[V]]
+ val fc = Class.forName(inputFormatClass).asInstanceOf[Class[F]]
+ if (path.isDefined) {
sc.sc.hadoopFile(path.get, fc, kc, vc)
} else {
sc.sc.hadoopRDD(new JobConf(conf), fc, kc, vc)
}
- rdd
}
def writeUTF(str: String, dataOut: DataOutputStream) {
@@ -540,24 +536,199 @@ private[spark] object PythonRDD extends Logging {
file.close()
}
+ private def getMergedConf(confAsMap: java.util.HashMap[String, String],
+ baseConf: Configuration): Configuration = {
+ val conf = PythonHadoopUtil.mapToConf(confAsMap)
+ PythonHadoopUtil.mergeConfs(baseConf, conf)
+ }
+
+ private def inferKeyValueTypes[K, V](rdd: RDD[(K, V)], keyConverterClass: String = null,
+ valueConverterClass: String = null): (Class[_], Class[_]) = {
+ // Peek at an element to figure out key/value types. Since Writables are not serializable,
+ // we cannot call first() on the converted RDD. Instead, we call first() on the original RDD
+ // and then convert locally.
+ val (key, value) = rdd.first()
+ val (kc, vc) = getKeyValueConverters(keyConverterClass, valueConverterClass,
+ new JavaToWritableConverter)
+ (kc.convert(key).getClass, vc.convert(value).getClass)
+ }
+
+ private def getKeyValueTypes(keyClass: String, valueClass: String):
+ Option[(Class[_], Class[_])] = {
+ for {
+ k <- Option(keyClass)
+ v <- Option(valueClass)
+ } yield (Class.forName(k), Class.forName(v))
+ }
+
+ private def getKeyValueConverters(keyConverterClass: String, valueConverterClass: String,
+ defaultConverter: Converter[Any, Any]): (Converter[Any, Any], Converter[Any, Any]) = {
+ val keyConverter = Converter.getInstance(Option(keyConverterClass), defaultConverter)
+ val valueConverter = Converter.getInstance(Option(valueConverterClass), defaultConverter)
+ (keyConverter, valueConverter)
+ }
+
+ /**
+ * Convert an RDD of key-value pairs from internal types to serializable types suitable for
+ * output, or vice versa.
+ */
+ private def convertRDD[K, V](rdd: RDD[(K, V)],
+ keyConverterClass: String,
+ valueConverterClass: String,
+ defaultConverter: Converter[Any, Any]): RDD[(Any, Any)] = {
+ val (kc, vc) = getKeyValueConverters(keyConverterClass, valueConverterClass,
+ defaultConverter)
+ PythonHadoopUtil.convertRDD(rdd, kc, vc)
+ }
+
+ /**
+ * Output a Python RDD of key-value pairs as a Hadoop SequenceFile using the Writable types
+ * we convert from the RDD's key and value types. Note that the keys and values can't
+ * already be [[org.apache.hadoop.io.Writable]] types, since Writables are not Java
+ * `Serializable` and we can't peek at them. The `path` can be on any Hadoop file system.
+ */
+ def saveAsSequenceFile[K, V, C <: CompressionCodec](
+ pyRDD: JavaRDD[Array[Byte]],
+ batchSerialized: Boolean,
+ path: String,
+ compressionCodecClass: String) = {
+ saveAsHadoopFile(
+ pyRDD, batchSerialized, path, "org.apache.hadoop.mapred.SequenceFileOutputFormat",
+ null, null, null, null, new java.util.HashMap(), compressionCodecClass)
+ }
+
+ /**
+ * Output a Python RDD of key-value pairs to any Hadoop file system, using old Hadoop
+ * `OutputFormat` in mapred package. Keys and values are converted to suitable output
+ * types using either user specified converters or, if not specified,
+ * [[org.apache.spark.api.python.JavaToWritableConverter]]. Post-conversion types
+ * `keyClass` and `valueClass` are automatically inferred if not specified. The passed-in
+ * `confAsMap` is merged with the default Hadoop conf associated with the SparkContext of
+ * this RDD.
+ */
+ def saveAsHadoopFile[K, V, F <: OutputFormat[_, _], C <: CompressionCodec](
+ pyRDD: JavaRDD[Array[Byte]],
+ batchSerialized: Boolean,
+ path: String,
+ outputFormatClass: String,
+ keyClass: String,
+ valueClass: String,
+ keyConverterClass: String,
+ valueConverterClass: String,
+ confAsMap: java.util.HashMap[String, String],
+ compressionCodecClass: String) = {
+ val rdd = SerDeUtil.pythonToPairRDD(pyRDD, batchSerialized)
+ val (kc, vc) = getKeyValueTypes(keyClass, valueClass).getOrElse(
+ inferKeyValueTypes(rdd, keyConverterClass, valueConverterClass))
+ val mergedConf = getMergedConf(confAsMap, pyRDD.context.hadoopConfiguration)
+ val codec = Option(compressionCodecClass).map(Class.forName(_).asInstanceOf[Class[C]])
+ val converted = convertRDD(rdd, keyConverterClass, valueConverterClass,
+ new JavaToWritableConverter)
+ val fc = Class.forName(outputFormatClass).asInstanceOf[Class[F]]
+ converted.saveAsHadoopFile(path, kc, vc, fc, new JobConf(mergedConf), codec=codec)
+ }
+
+ /**
+ * Output a Python RDD of key-value pairs to any Hadoop file system, using new Hadoop
+ * `OutputFormat` in mapreduce package. Keys and values are converted to suitable output
+ * types using either user specified converters or, if not specified,
+ * [[org.apache.spark.api.python.JavaToWritableConverter]]. Post-conversion types
+ * `keyClass` and `valueClass` are automatically inferred if not specified. The passed-in
+ * `confAsMap` is merged with the default Hadoop conf associated with the SparkContext of
+ * this RDD.
+ */
+ def saveAsNewAPIHadoopFile[K, V, F <: NewOutputFormat[_, _]](
+ pyRDD: JavaRDD[Array[Byte]],
+ batchSerialized: Boolean,
+ path: String,
+ outputFormatClass: String,
+ keyClass: String,
+ valueClass: String,
+ keyConverterClass: String,
+ valueConverterClass: String,
+ confAsMap: java.util.HashMap[String, String]) = {
+ val rdd = SerDeUtil.pythonToPairRDD(pyRDD, batchSerialized)
+ val (kc, vc) = getKeyValueTypes(keyClass, valueClass).getOrElse(
+ inferKeyValueTypes(rdd, keyConverterClass, valueConverterClass))
+ val mergedConf = getMergedConf(confAsMap, pyRDD.context.hadoopConfiguration)
+ val converted = convertRDD(rdd, keyConverterClass, valueConverterClass,
+ new JavaToWritableConverter)
+ val fc = Class.forName(outputFormatClass).asInstanceOf[Class[F]]
+ converted.saveAsNewAPIHadoopFile(path, kc, vc, fc, mergedConf)
+ }
+
/**
- * Convert an RDD of serialized Python dictionaries to Scala Maps
- * TODO: Support more Python types.
+ * Output a Python RDD of key-value pairs to any Hadoop file system, using a Hadoop conf
+ * converted from the passed-in `confAsMap`. The conf should set relevant output params
+ * (e.g., output path, output format, etc.), in the same way as it would be configured for
+ * a Hadoop MapReduce job. Both old and new Hadoop OutputFormat APIs are supported
+ * (mapred vs. mapreduce). Keys/values are converted for output using either user specified
+ * converters or, by default, [[org.apache.spark.api.python.JavaToWritableConverter]].
*/
+ def saveAsHadoopDataset[K, V](
+ pyRDD: JavaRDD[Array[Byte]],
+ batchSerialized: Boolean,
+ confAsMap: java.util.HashMap[String, String],
+ keyConverterClass: String,
+ valueConverterClass: String,
+ useNewAPI: Boolean) = {
+ val conf = PythonHadoopUtil.mapToConf(confAsMap)
+ val converted = convertRDD(SerDeUtil.pythonToPairRDD(pyRDD, batchSerialized),
+ keyConverterClass, valueConverterClass, new JavaToWritableConverter)
+ if (useNewAPI) {
+ converted.saveAsNewAPIHadoopDataset(conf)
+ } else {
+ converted.saveAsHadoopDataset(new JobConf(conf))
+ }
+ }
+
+
+ /**
+ * Convert an RDD of serialized Python dictionaries to Scala Maps (no recursive conversions).
+ */
+ @deprecated("PySpark does not use it anymore", "1.1")
def pythonToJavaMap(pyRDD: JavaRDD[Array[Byte]]): JavaRDD[Map[String, _]] = {
pyRDD.rdd.mapPartitions { iter =>
val unpickle = new Unpickler
- // TODO: Figure out why flatMap is necessay for pyspark
iter.flatMap { row =>
unpickle.loads(row) match {
- case objs: java.util.ArrayList[JMap[String, _] @unchecked] => objs.map(_.toMap)
- // Incase the partition doesn't have a collection
+ // in case objects are pickled in batch mode
+ case objs: JArrayList[JMap[String, _] @unchecked] => objs.map(_.toMap)
+ // not in batch mode
case obj: JMap[String @unchecked, _] => Seq(obj.toMap)
}
}
}
}
+ /**
+ * Convert an RDD of serialized Python tuples to Arrays (no recursive conversions).
+ * It is only used by pyspark.sql.
+ */
+ def pythonToJavaArray(pyRDD: JavaRDD[Array[Byte]], batched: Boolean): JavaRDD[Array[_]] = {
+
+ def toArray(obj: Any): Array[_] = {
+ obj match {
+ case objs: JArrayList[_] =>
+ objs.toArray
+ case obj if obj.getClass.isArray =>
+ obj.asInstanceOf[Array[_]].toArray
+ }
+ }
+
+ pyRDD.rdd.mapPartitions { iter =>
+ val unpickle = new Unpickler
+ iter.flatMap { row =>
+ val obj = unpickle.loads(row)
+ if (batched) {
+ obj.asInstanceOf[JArrayList[_]].map(toArray)
+ } else {
+ Seq(toArray(obj))
+ }
+ }
+ }.toJavaRDD()
+ }
+
/**
* Convert an RDD of Java objects to an RDD of serialized Python objects that is usable by
* PySpark.
@@ -588,19 +759,30 @@ private class PythonAccumulatorParam(@transient serverHost: String, serverPort:
val bufferSize = SparkEnv.get.conf.getInt("spark.buffer.size", 65536)
+ /**
+ * We try to reuse a single Socket to transfer accumulator updates, as they are all added
+ * by the DAGScheduler's single-threaded actor anyway.
+ */
+ @transient var socket: Socket = _
+
+ def openSocket(): Socket = synchronized {
+ if (socket == null || socket.isClosed) {
+ socket = new Socket(serverHost, serverPort)
+ }
+ socket
+ }
+
override def zero(value: JList[Array[Byte]]): JList[Array[Byte]] = new JArrayList
override def addInPlace(val1: JList[Array[Byte]], val2: JList[Array[Byte]])
- : JList[Array[Byte]] = {
+ : JList[Array[Byte]] = synchronized {
if (serverHost == null) {
// This happens on the worker node, where we just want to remember all the updates
val1.addAll(val2)
val1
} else {
// This happens on the master, where we pass the updates to Python through a socket
- val socket = new Socket(serverHost, serverPort)
- // SPARK-2282: Immediately reuse closed sockets because we create one per task.
- socket.setReuseAddress(true)
+ val socket = openSocket()
val in = socket.getInputStream
val out = new DataOutputStream(new BufferedOutputStream(socket.getOutputStream, bufferSize))
out.writeInt(val2.size)
@@ -614,7 +796,6 @@ private class PythonAccumulatorParam(@transient serverHost: String, serverPort:
if (byteRead == -1) {
throw new SparkException("EOF reached before Python server acknowledged")
}
- socket.close()
null
}
}
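One detail of the save path above that is easy to miss: `getKeyValueTypes` only returns `Some` when both `keyClass` and `valueClass` are supplied; otherwise `inferKeyValueTypes` peeks at the first element. A tiny stand-alone sketch of that both-or-none Option logic (plain strings stand in for loaded classes so it runs without Spark):

object BothOrNoneSketch {
  // Mirrors the for-comprehension in getKeyValueTypes: Some only if both names are non-null.
  def bothOrNone(keyClass: String, valueClass: String): Option[(String, String)] =
    for {
      k <- Option(keyClass)
      v <- Option(valueClass)
    } yield (k, v)

  def main(args: Array[String]): Unit = {
    println(bothOrNone("org.apache.hadoop.io.IntWritable", null)) // None -> infer from the data
    println(bothOrNone("org.apache.hadoop.io.IntWritable", "org.apache.hadoop.io.Text")) // Some(...)
  }
}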
diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala b/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala
index 6d3e257c4d5df..52c70712eea3d 100644
--- a/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala
+++ b/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala
@@ -29,7 +29,7 @@ private[spark] object PythonUtils {
val pythonPath = new ArrayBuffer[String]
for (sparkHome <- sys.env.get("SPARK_HOME")) {
pythonPath += Seq(sparkHome, "python").mkString(File.separator)
- pythonPath += Seq(sparkHome, "python", "lib", "py4j-0.8.1-src.zip").mkString(File.separator)
+ pythonPath += Seq(sparkHome, "python", "lib", "py4j-0.8.2.1-src.zip").mkString(File.separator)
}
pythonPath ++= SparkContext.jarOfObject(this)
pythonPath.mkString(File.pathSeparator)
diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala b/core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala
index 759cbe2c46c52..7af260d0b7f26 100644
--- a/core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala
+++ b/core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala
@@ -17,9 +17,11 @@
package org.apache.spark.api.python
-import java.io.{DataInputStream, InputStream, OutputStreamWriter}
+import java.lang.Runtime
+import java.io.{DataOutputStream, DataInputStream, InputStream, OutputStreamWriter}
import java.net.{InetAddress, ServerSocket, Socket, SocketException}
+import scala.collection.mutable
import scala.collection.JavaConversions._
import org.apache.spark._
@@ -39,6 +41,9 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String
var daemon: Process = null
val daemonHost = InetAddress.getByAddress(Array(127, 0, 0, 1))
var daemonPort: Int = 0
+ var daemonWorkers = new mutable.WeakHashMap[Socket, Int]()
+
+ var simpleWorkers = new mutable.WeakHashMap[Socket, Process]()
val pythonPath = PythonUtils.mergePythonPaths(
PythonUtils.sparkPythonPath,
@@ -58,19 +63,31 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String
* to avoid the high cost of forking from Java. This currently only works on UNIX-based systems.
*/
private def createThroughDaemon(): Socket = {
+
+ def createSocket(): Socket = {
+ val socket = new Socket(daemonHost, daemonPort)
+ val pid = new DataInputStream(socket.getInputStream).readInt()
+ if (pid < 0) {
+ throw new IllegalStateException("Python daemon failed to launch worker")
+ }
+ daemonWorkers.put(socket, pid)
+ socket
+ }
+
synchronized {
// Start the daemon if it hasn't been started
startDaemon()
// Attempt to connect, restart and retry once if it fails
try {
- new Socket(daemonHost, daemonPort)
+ createSocket()
} catch {
case exc: SocketException =>
- logWarning("Python daemon unexpectedly quit, attempting to restart")
+ logWarning("Failed to open socket to Python daemon:", exc)
+ logWarning("Assuming that daemon unexpectedly quit, attempting to restart")
stopDaemon()
startDaemon()
- new Socket(daemonHost, daemonPort)
+ createSocket()
}
}
}
@@ -101,7 +118,9 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String
// Wait for it to connect to our socket
serverSocket.setSoTimeout(10000)
try {
- return serverSocket.accept()
+ val socket = serverSocket.accept()
+ simpleWorkers.put(socket, worker)
+ return socket
} catch {
case e: Exception =>
throw new SparkException("Python worker did not connect back in time", e)
@@ -183,19 +202,40 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String
private def stopDaemon() {
synchronized {
- // Request shutdown of existing daemon by sending SIGTERM
- if (daemon != null) {
- daemon.destroy()
+ if (useDaemon) {
+ // Request shutdown of existing daemon by sending SIGTERM
+ if (daemon != null) {
+ daemon.destroy()
+ }
+
+ daemon = null
+ daemonPort = 0
+ } else {
+ simpleWorkers.values.foreach(_.destroy())
}
-
- daemon = null
- daemonPort = 0
}
}
def stop() {
stopDaemon()
}
+
+ def stopWorker(worker: Socket) {
+ if (useDaemon) {
+ if (daemon != null) {
+ daemonWorkers.get(worker).foreach { pid =>
+ // tell daemon to kill worker by pid
+ val output = new DataOutputStream(daemon.getOutputStream)
+ output.writeInt(pid)
+ output.flush()
+ daemon.getOutputStream.flush()
+ }
+ }
+ } else {
+ simpleWorkers.get(worker).foreach(_.destroy())
+ }
+ worker.close()
+ }
}
private object PythonWorkerFactory {
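The daemon handshake that `createSocket` and `stopWorker` above depend on is just a 4-byte int in each direction: the daemon announces the forked worker's pid on the freshly accepted socket, and the driver later writes a pid back to the daemon's stdin to kill that worker. A driver-local sketch (in-memory streams stand in for the socket; the pid value is made up):

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream}

object PidHandshakeSketch {
  def main(args: Array[String]): Unit = {
    // Daemon side: announce the worker pid right after accepting the connection.
    val wire = new ByteArrayOutputStream()
    new DataOutputStream(wire).writeInt(4242)

    // Driver side: read the pid and remember it (daemonWorkers) so stopWorker can target it.
    val pid = new DataInputStream(new ByteArrayInputStream(wire.toByteArray)).readInt()
    println(s"worker pid = $pid") // worker pid = 4242
  }
}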
diff --git a/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala b/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala
index 9a012e7254901..efc9009c088a8 100644
--- a/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala
+++ b/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala
@@ -17,13 +17,14 @@
package org.apache.spark.api.python
-import scala.util.Try
-import org.apache.spark.rdd.RDD
-import org.apache.spark.Logging
-import scala.util.Success
+import scala.collection.JavaConversions._
import scala.util.Failure
-import net.razorvine.pickle.Pickler
+import scala.util.Try
+import net.razorvine.pickle.{Unpickler, Pickler}
+
+import org.apache.spark.{Logging, SparkException}
+import org.apache.spark.rdd.RDD
/** Utilities for serialization / deserialization between Python and Java, using Pickle. */
private[python] object SerDeUtil extends Logging {
@@ -65,20 +66,52 @@ private[python] object SerDeUtil extends Logging {
* by PySpark. By default, if serialization fails, toString is called and the string
* representation is serialized
*/
- def rddToPython(rdd: RDD[(Any, Any)]): RDD[Array[Byte]] = {
+ def pairRDDToPython(rdd: RDD[(Any, Any)], batchSize: Int): RDD[Array[Byte]] = {
val (keyFailed, valueFailed) = checkPickle(rdd.first())
rdd.mapPartitions { iter =>
val pickle = new Pickler
- iter.map { case (k, v) =>
- if (keyFailed && valueFailed) {
- pickle.dumps(Array(k.toString, v.toString))
- } else if (keyFailed) {
- pickle.dumps(Array(k.toString, v))
- } else if (!keyFailed && valueFailed) {
- pickle.dumps(Array(k, v.toString))
+ val cleaned = iter.map { case (k, v) =>
+ val key = if (keyFailed) k.toString else k
+ val value = if (valueFailed) v.toString else v
+ Array[Any](key, value)
+ }
+ if (batchSize > 1) {
+ cleaned.grouped(batchSize).map(batched => pickle.dumps(seqAsJavaList(batched)))
+ } else {
+ cleaned.map(pickle.dumps(_))
+ }
+ }
+ }
+
+ /**
+ * Convert an RDD of serialized Python tuples (K, V) to RDD[(K, V)].
+ */
+ def pythonToPairRDD[K, V](pyRDD: RDD[Array[Byte]], batchSerialized: Boolean): RDD[(K, V)] = {
+ def isPair(obj: Any): Boolean = {
+ Option(obj.getClass.getComponentType).map(!_.isPrimitive).getOrElse(false) &&
+ obj.asInstanceOf[Array[_]].length == 2
+ }
+ pyRDD.mapPartitions { iter =>
+ val unpickle = new Unpickler
+ val unpickled =
+ if (batchSerialized) {
+ iter.flatMap { batch =>
+ unpickle.loads(batch) match {
+ case objs: java.util.List[_] => collectionAsScalaIterable(objs)
+ case other => throw new SparkException(
+ s"Unexpected type ${other.getClass.getName} for batch serialized Python RDD")
+ }
+ }
} else {
- pickle.dumps(Array(k, v))
+ iter.map(unpickle.loads(_))
}
+ unpickled.map {
+ case obj if isPair(obj) =>
+ // we only accept (K, V)
+ val arr = obj.asInstanceOf[Array[_]]
+ (arr.head.asInstanceOf[K], arr.last.asInstanceOf[V])
+ case other => throw new SparkException(
+ s"RDD element of type ${other.getClass.getName} cannot be used")
}
}
}
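The batching in `pairRDDToPython` above hinges on `Iterator.grouped`: with `batchSize > 1`, each `pickle.dumps` call serializes a whole batch rather than a single record. A self-contained sketch of just that grouping (tuples and `println` stand in for RDD records and pickling):

object BatchingSketch {
  def main(args: Array[String]): Unit = {
    val cleaned = Iterator((1, "a"), (2, "b"), (3, "c")).map { case (k, v) => Array[Any](k, v) }
    val batchSize = 2
    val out =
      if (batchSize > 1) {
        // grouped(batchSize) is what turns per-record pickling into batch pickling
        cleaned.grouped(batchSize).map(_.map(_.mkString("(", ", ", ")")).mkString("[", " ", "]"))
      } else {
        cleaned.map(_.mkString("(", ", ", ")"))
      }
    out.foreach(println) // [(1, a) (2, b)] then [(3, c)]
  }
}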
diff --git a/core/src/main/scala/org/apache/spark/api/python/WriteInputFormatTestDataGenerator.scala b/core/src/main/scala/org/apache/spark/api/python/WriteInputFormatTestDataGenerator.scala
index f0e3fb9aff5a0..d11db978b842e 100644
--- a/core/src/main/scala/org/apache/spark/api/python/WriteInputFormatTestDataGenerator.scala
+++ b/core/src/main/scala/org/apache/spark/api/python/WriteInputFormatTestDataGenerator.scala
@@ -17,15 +17,16 @@
package org.apache.spark.api.python
-import org.apache.spark.SparkContext
-import org.apache.hadoop.io._
-import scala.Array
import java.io.{DataOutput, DataInput}
+import java.nio.charset.Charset
+
+import org.apache.hadoop.io._
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat
import org.apache.spark.api.java.JavaSparkContext
+import org.apache.spark.{SparkContext, SparkException}
/**
- * A class to test MsgPack serialization on the Scala side, that will be deserialized
+ * A class to test Pyrolite serialization on the Scala side that will be deserialized
* in Python
* @param str
* @param int
@@ -54,7 +55,13 @@ case class TestWritable(var str: String, var int: Int, var double: Double) exten
}
}
-class TestConverter extends Converter[Any, Any] {
+private[python] class TestInputKeyConverter extends Converter[Any, Any] {
+ override def convert(obj: Any) = {
+ obj.asInstanceOf[IntWritable].get().toChar
+ }
+}
+
+private[python] class TestInputValueConverter extends Converter[Any, Any] {
import collection.JavaConversions._
override def convert(obj: Any) = {
val m = obj.asInstanceOf[MapWritable]
@@ -62,6 +69,38 @@ class TestConverter extends Converter[Any, Any] {
}
}
+private[python] class TestOutputKeyConverter extends Converter[Any, Any] {
+ override def convert(obj: Any) = {
+ new Text(obj.asInstanceOf[Int].toString)
+ }
+}
+
+private[python] class TestOutputValueConverter extends Converter[Any, Any] {
+ import collection.JavaConversions._
+ override def convert(obj: Any) = {
+ new DoubleWritable(obj.asInstanceOf[java.util.Map[Double, _]].keySet().head)
+ }
+}
+
+private[python] class DoubleArrayWritable extends ArrayWritable(classOf[DoubleWritable])
+
+private[python] class DoubleArrayToWritableConverter extends Converter[Any, Writable] {
+ override def convert(obj: Any) = obj match {
+ case arr if arr.getClass.isArray && arr.getClass.getComponentType == classOf[Double] =>
+ val daw = new DoubleArrayWritable
+ daw.set(arr.asInstanceOf[Array[Double]].map(new DoubleWritable(_)))
+ daw
+ case other => throw new SparkException(s"Data of type $other is not supported")
+ }
+}
+
+private[python] class WritableToDoubleArrayConverter extends Converter[Any, Array[Double]] {
+ override def convert(obj: Any): Array[Double] = obj match {
+ case daw : DoubleArrayWritable => daw.get().map(_.asInstanceOf[DoubleWritable].get())
+ case other => throw new SparkException(s"Data of type $other is not supported")
+ }
+}
+
/**
* This object contains methods to generate SequenceFile test data and write it to a
* given directory (probably a temp directory)
@@ -97,7 +136,8 @@ object WriteInputFormatTestDataGenerator {
sc.parallelize(intKeys).saveAsSequenceFile(intPath)
sc.parallelize(intKeys.map{ case (k, v) => (k.toDouble, v) }).saveAsSequenceFile(doublePath)
sc.parallelize(intKeys.map{ case (k, v) => (k.toString, v) }).saveAsSequenceFile(textPath)
- sc.parallelize(intKeys.map{ case (k, v) => (k, v.getBytes) }).saveAsSequenceFile(bytesPath)
+ sc.parallelize(intKeys.map{ case (k, v) => (k, v.getBytes(Charset.forName("UTF-8"))) }
+ ).saveAsSequenceFile(bytesPath)
val bools = Seq((1, true), (2, true), (2, false), (3, true), (2, false), (1, false))
sc.parallelize(bools).saveAsSequenceFile(boolPath)
sc.parallelize(intKeys).map{ case (k, v) =>
@@ -106,19 +146,20 @@ object WriteInputFormatTestDataGenerator {
// Create test data for ArrayWritable
val data = Seq(
- (1, Array(1.0, 2.0, 3.0)),
+ (1, Array()),
(2, Array(3.0, 4.0, 5.0)),
(3, Array(4.0, 5.0, 6.0))
)
sc.parallelize(data, numSlices = 2)
.map{ case (k, v) =>
- (new IntWritable(k), new ArrayWritable(classOf[DoubleWritable], v.map(new DoubleWritable(_))))
- }.saveAsNewAPIHadoopFile[SequenceFileOutputFormat[IntWritable, ArrayWritable]](arrPath)
+ val va = new DoubleArrayWritable
+ va.set(v.map(new DoubleWritable(_)))
+ (new IntWritable(k), va)
+ }.saveAsNewAPIHadoopFile[SequenceFileOutputFormat[IntWritable, DoubleArrayWritable]](arrPath)
// Create test data for MapWritable, with keys DoubleWritable and values Text
val mapData = Seq(
- (1, Map(2.0 -> "aa")),
- (2, Map(3.0 -> "bb")),
+ (1, Map()),
(2, Map(1.0 -> "cc")),
(3, Map(2.0 -> "dd")),
(2, Map(1.0 -> "aa")),
@@ -126,9 +167,9 @@ object WriteInputFormatTestDataGenerator {
)
sc.parallelize(mapData, numSlices = 2).map{ case (i, m) =>
val mw = new MapWritable()
- val k = m.keys.head
- val v = m.values.head
- mw.put(new DoubleWritable(k), new Text(v))
+ m.foreach { case (k, v) =>
+ mw.put(new DoubleWritable(k), new Text(v))
+ }
(new IntWritable(i), mw)
}.saveAsSequenceFile(mapPath)
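The new array converters above round-trip an `Array[Double]` through `DoubleArrayWritable`. Since they are private[python], the sketch below assumes it is compiled inside that package; the values are otherwise arbitrary:

package org.apache.spark.api.python

// Round-trip sketch for the converters defined above (Array[Double] <-> DoubleArrayWritable).
object DoubleArrayRoundTripSketch {
  def main(args: Array[String]): Unit = {
    val toWritable = new DoubleArrayToWritableConverter
    val fromWritable = new WritableToDoubleArrayConverter
    val writable = toWritable.convert(Array(1.0, 2.0, 3.0)) // matches the Array[Double] case
    val back = fromWritable.convert(writable)               // back to Array[Double]
    println(back.mkString(", "))                            // 1.0, 2.0, 3.0
  }
}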
diff --git a/core/src/main/scala/org/apache/spark/broadcast/Broadcast.scala b/core/src/main/scala/org/apache/spark/broadcast/Broadcast.scala
index 76956f6a345d1..15fd30e65761d 100644
--- a/core/src/main/scala/org/apache/spark/broadcast/Broadcast.scala
+++ b/core/src/main/scala/org/apache/spark/broadcast/Broadcast.scala
@@ -106,23 +106,23 @@ abstract class Broadcast[T: ClassTag](val id: Long) extends Serializable {
* Actually get the broadcasted value. Concrete implementations of Broadcast class must
* define their own way to get the value.
*/
- private[spark] def getValue(): T
+ protected def getValue(): T
/**
* Actually unpersist the broadcasted value on the executors. Concrete implementations of
* Broadcast class must define their own logic to unpersist their own data.
*/
- private[spark] def doUnpersist(blocking: Boolean)
+ protected def doUnpersist(blocking: Boolean)
/**
* Actually destroy all data and metadata related to this broadcast variable.
* Implementation of Broadcast class must define their own logic to destroy their own
* state.
*/
- private[spark] def doDestroy(blocking: Boolean)
+ protected def doDestroy(blocking: Boolean)
/** Check if this broadcast is valid. If not valid, exception is thrown. */
- private[spark] def assertValid() {
+ protected def assertValid() {
if (!_isValid) {
throw new SparkException("Attempted to use %s after it has been destroyed!".format(toString))
}
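With `getValue`, `doUnpersist` and `doDestroy` now `protected` rather than `private[spark]`, a concrete broadcast implementation overrides exactly these three members. A hypothetical, deliberately minimal subclass (it skips the BlockManager bookkeeping a real factory would do):

import scala.reflect.ClassTag

import org.apache.spark.broadcast.Broadcast

// Hypothetical subclass used only to show the protected surface; not a usable broadcast.
class LocalOnlyBroadcast[T: ClassTag](v: T, id: Long) extends Broadcast[T](id) {
  override protected def getValue(): T = v
  override protected def doUnpersist(blocking: Boolean): Unit = () // nothing stored remotely
  override protected def doDestroy(blocking: Boolean): Unit = ()   // nothing to clean up
}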
diff --git a/core/src/main/scala/org/apache/spark/broadcast/BroadcastManager.scala b/core/src/main/scala/org/apache/spark/broadcast/BroadcastManager.scala
index c88be6aba6901..8f8a0b11f9f2e 100644
--- a/core/src/main/scala/org/apache/spark/broadcast/BroadcastManager.scala
+++ b/core/src/main/scala/org/apache/spark/broadcast/BroadcastManager.scala
@@ -39,7 +39,7 @@ private[spark] class BroadcastManager(
synchronized {
if (!initialized) {
val broadcastFactoryClass =
- conf.get("spark.broadcast.factory", "org.apache.spark.broadcast.HttpBroadcastFactory")
+ conf.get("spark.broadcast.factory", "org.apache.spark.broadcast.TorrentBroadcastFactory")
broadcastFactory =
Class.forName(broadcastFactoryClass).newInstance.asInstanceOf[BroadcastFactory]
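Since this changes the out-of-the-box value of `spark.broadcast.factory` to TorrentBroadcastFactory, applications that want the previous HTTP behavior must set it back explicitly. A minimal sketch (the app name is arbitrary):

import org.apache.spark.SparkConf

object BroadcastFactoryConfSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("broadcast-factory-demo")
      .set("spark.broadcast.factory", "org.apache.spark.broadcast.HttpBroadcastFactory")
    println(conf.get("spark.broadcast.factory"))
  }
}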
diff --git a/core/src/main/scala/org/apache/spark/broadcast/HttpBroadcast.scala b/core/src/main/scala/org/apache/spark/broadcast/HttpBroadcast.scala
index 4f6cabaff2b99..487456467b23b 100644
--- a/core/src/main/scala/org/apache/spark/broadcast/HttpBroadcast.scala
+++ b/core/src/main/scala/org/apache/spark/broadcast/HttpBroadcast.scala
@@ -40,9 +40,9 @@ private[spark] class HttpBroadcast[T: ClassTag](
@transient var value_ : T, isLocal: Boolean, id: Long)
extends Broadcast[T](id) with Logging with Serializable {
- def getValue = value_
+ override protected def getValue() = value_
- val blockId = BroadcastBlockId(id)
+ private val blockId = BroadcastBlockId(id)
/*
* Broadcasted data is also stored in the BlockManager of the driver. The BlockManagerMaster
@@ -60,14 +60,14 @@ private[spark] class HttpBroadcast[T: ClassTag](
/**
* Remove all persisted state associated with this HTTP broadcast on the executors.
*/
- def doUnpersist(blocking: Boolean) {
+ override protected def doUnpersist(blocking: Boolean) {
HttpBroadcast.unpersist(id, removeFromDriver = false, blocking)
}
/**
* Remove all persisted state associated with this HTTP broadcast on the executors and driver.
*/
- def doDestroy(blocking: Boolean) {
+ override protected def doDestroy(blocking: Boolean) {
HttpBroadcast.unpersist(id, removeFromDriver = true, blocking)
}
@@ -102,7 +102,7 @@ private[spark] class HttpBroadcast[T: ClassTag](
}
}
-private[spark] object HttpBroadcast extends Logging {
+private[broadcast] object HttpBroadcast extends Logging {
private var initialized = false
private var broadcastDir: File = null
private var compress: Boolean = false
@@ -160,7 +160,7 @@ private[spark] object HttpBroadcast extends Logging {
def getFile(id: Long) = new File(broadcastDir, BroadcastBlockId(id).name)
- def write(id: Long, value: Any) {
+ private def write(id: Long, value: Any) {
val file = getFile(id)
val out: OutputStream = {
if (compress) {
@@ -176,7 +176,7 @@ private[spark] object HttpBroadcast extends Logging {
files += file
}
- def read[T: ClassTag](id: Long): T = {
+ private def read[T: ClassTag](id: Long): T = {
logDebug("broadcast read server: " + serverUri + " id: broadcast-" + id)
val url = serverUri + "/" + BroadcastBlockId(id).name
diff --git a/core/src/main/scala/org/apache/spark/broadcast/HttpBroadcastFactory.scala b/core/src/main/scala/org/apache/spark/broadcast/HttpBroadcastFactory.scala
index d5a031e2bbb59..c7ef02d572a19 100644
--- a/core/src/main/scala/org/apache/spark/broadcast/HttpBroadcastFactory.scala
+++ b/core/src/main/scala/org/apache/spark/broadcast/HttpBroadcastFactory.scala
@@ -27,21 +27,21 @@ import org.apache.spark.{SecurityManager, SparkConf}
* [[org.apache.spark.broadcast.HttpBroadcast]] for more details about this mechanism.
*/
class HttpBroadcastFactory extends BroadcastFactory {
- def initialize(isDriver: Boolean, conf: SparkConf, securityMgr: SecurityManager) {
+ override def initialize(isDriver: Boolean, conf: SparkConf, securityMgr: SecurityManager) {
HttpBroadcast.initialize(isDriver, conf, securityMgr)
}
- def newBroadcast[T: ClassTag](value_ : T, isLocal: Boolean, id: Long) =
+ override def newBroadcast[T: ClassTag](value_ : T, isLocal: Boolean, id: Long) =
new HttpBroadcast[T](value_, isLocal, id)
- def stop() { HttpBroadcast.stop() }
+ override def stop() { HttpBroadcast.stop() }
/**
* Remove all persisted state associated with the HTTP broadcast with the given ID.
* @param removeFromDriver Whether to remove state from the driver
* @param blocking Whether to block until unbroadcasted
*/
- def unbroadcast(id: Long, removeFromDriver: Boolean, blocking: Boolean) {
+ override def unbroadcast(id: Long, removeFromDriver: Boolean, blocking: Boolean) {
HttpBroadcast.unpersist(id, removeFromDriver, blocking)
}
}
diff --git a/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala b/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala
index 734de37ba115d..86731b684f441 100644
--- a/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala
+++ b/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala
@@ -20,7 +20,6 @@ package org.apache.spark.broadcast
import java.io.{ByteArrayInputStream, ObjectInputStream, ObjectOutputStream}
import scala.reflect.ClassTag
-import scala.math
import scala.util.Random
import org.apache.spark.{Logging, SparkConf, SparkEnv, SparkException}
@@ -49,19 +48,19 @@ private[spark] class TorrentBroadcast[T: ClassTag](
@transient var value_ : T, isLocal: Boolean, id: Long)
extends Broadcast[T](id) with Logging with Serializable {
- def getValue = value_
+ override protected def getValue() = value_
- val broadcastId = BroadcastBlockId(id)
+ private val broadcastId = BroadcastBlockId(id)
TorrentBroadcast.synchronized {
SparkEnv.get.blockManager.putSingle(
broadcastId, value_, StorageLevel.MEMORY_AND_DISK, tellMaster = false)
}
- @transient var arrayOfBlocks: Array[TorrentBlock] = null
- @transient var totalBlocks = -1
- @transient var totalBytes = -1
- @transient var hasBlocks = 0
+ @transient private var arrayOfBlocks: Array[TorrentBlock] = null
+ @transient private var totalBlocks = -1
+ @transient private var totalBytes = -1
+ @transient private var hasBlocks = 0
if (!isLocal) {
sendBroadcast()
@@ -70,7 +69,7 @@ private[spark] class TorrentBroadcast[T: ClassTag](
/**
* Remove all persisted state associated with this Torrent broadcast on the executors.
*/
- def doUnpersist(blocking: Boolean) {
+ override protected def doUnpersist(blocking: Boolean) {
TorrentBroadcast.unpersist(id, removeFromDriver = false, blocking)
}
@@ -78,11 +77,11 @@ private[spark] class TorrentBroadcast[T: ClassTag](
* Remove all persisted state associated with this Torrent broadcast on the executors
* and driver.
*/
- def doDestroy(blocking: Boolean) {
+ override protected def doDestroy(blocking: Boolean) {
TorrentBroadcast.unpersist(id, removeFromDriver = true, blocking)
}
- def sendBroadcast() {
+ private def sendBroadcast() {
val tInfo = TorrentBroadcast.blockifyObject(value_)
totalBlocks = tInfo.totalBlocks
totalBytes = tInfo.totalBytes
@@ -159,7 +158,7 @@ private[spark] class TorrentBroadcast[T: ClassTag](
hasBlocks = 0
}
- def receiveBroadcast(): Boolean = {
+ private def receiveBroadcast(): Boolean = {
// Receive meta-info about the size of broadcast data,
// the number of chunks it is divided into, etc.
val metaId = BroadcastBlockId(id, "meta")
@@ -211,7 +210,7 @@ private[spark] class TorrentBroadcast[T: ClassTag](
}
-private[spark] object TorrentBroadcast extends Logging {
+private[broadcast] object TorrentBroadcast extends Logging {
private lazy val BLOCK_SIZE = conf.getInt("spark.broadcast.blockSize", 4096) * 1024
private var initialized = false
private var conf: SparkConf = null
@@ -272,17 +271,19 @@ private[spark] object TorrentBroadcast extends Logging {
* Remove all persisted blocks associated with this torrent broadcast on the executors.
* If removeFromDriver is true, also remove these persisted blocks on the driver.
*/
- def unpersist(id: Long, removeFromDriver: Boolean, blocking: Boolean) = synchronized {
- SparkEnv.get.blockManager.master.removeBroadcast(id, removeFromDriver, blocking)
+ def unpersist(id: Long, removeFromDriver: Boolean, blocking: Boolean) = {
+ synchronized {
+ SparkEnv.get.blockManager.master.removeBroadcast(id, removeFromDriver, blocking)
+ }
}
}
-private[spark] case class TorrentBlock(
+private[broadcast] case class TorrentBlock(
blockID: Int,
byteArray: Array[Byte])
extends Serializable
-private[spark] case class TorrentInfo(
+private[broadcast] case class TorrentInfo(
@transient arrayOfBlocks: Array[TorrentBlock],
totalBlocks: Int,
totalBytes: Int)
diff --git a/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcastFactory.scala b/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcastFactory.scala
index 1de8396a0e17f..ad0f701d7a98f 100644
--- a/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcastFactory.scala
+++ b/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcastFactory.scala
@@ -28,21 +28,21 @@ import org.apache.spark.{SecurityManager, SparkConf}
*/
class TorrentBroadcastFactory extends BroadcastFactory {
- def initialize(isDriver: Boolean, conf: SparkConf, securityMgr: SecurityManager) {
+ override def initialize(isDriver: Boolean, conf: SparkConf, securityMgr: SecurityManager) {
TorrentBroadcast.initialize(isDriver, conf)
}
- def newBroadcast[T: ClassTag](value_ : T, isLocal: Boolean, id: Long) =
+ override def newBroadcast[T: ClassTag](value_ : T, isLocal: Boolean, id: Long) =
new TorrentBroadcast[T](value_, isLocal, id)
- def stop() { TorrentBroadcast.stop() }
+ override def stop() { TorrentBroadcast.stop() }
/**
* Remove all persisted state associated with the torrent broadcast with the given ID.
* @param removeFromDriver Whether to remove state from the driver.
* @param blocking Whether to block until unbroadcasted
*/
- def unbroadcast(id: Long, removeFromDriver: Boolean, blocking: Boolean) {
+ override def unbroadcast(id: Long, removeFromDriver: Boolean, blocking: Boolean) {
TorrentBroadcast.unpersist(id, removeFromDriver, blocking)
}
}
diff --git a/core/src/main/scala/org/apache/spark/deploy/ApplicationDescription.scala b/core/src/main/scala/org/apache/spark/deploy/ApplicationDescription.scala
index 86305d2ea8a09..65a1a8fd7e929 100644
--- a/core/src/main/scala/org/apache/spark/deploy/ApplicationDescription.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/ApplicationDescription.scala
@@ -22,7 +22,6 @@ private[spark] class ApplicationDescription(
val maxCores: Option[Int],
val memoryPerSlave: Int,
val command: Command,
- val sparkHome: Option[String],
var appUiUrl: String,
val eventLogDir: Option[String] = None)
extends Serializable {
diff --git a/core/src/main/scala/org/apache/spark/deploy/Client.scala b/core/src/main/scala/org/apache/spark/deploy/Client.scala
index c371dc3a51c73..17c507af2652d 100644
--- a/core/src/main/scala/org/apache/spark/deploy/Client.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/Client.scala
@@ -17,8 +17,6 @@
package org.apache.spark.deploy
-import scala.collection.JavaConversions._
-import scala.collection.mutable.Map
import scala.concurrent._
import akka.actor._
@@ -50,9 +48,6 @@ private class ClientActor(driverArgs: ClientArguments, conf: SparkConf) extends
// TODO: We could add an env variable here and intercept it in `sc.addJar` that would
// truncate filesystem paths similar to what YARN does. For now, we just require
// people call `addJar` assuming the jar is in the same directory.
- val env = Map[String, String]()
- System.getenv().foreach{case (k, v) => env(k) = v}
-
val mainClass = "org.apache.spark.deploy.worker.DriverWrapper"
val classPathConf = "spark.driver.extraClassPath"
@@ -65,10 +60,13 @@ private class ClientActor(driverArgs: ClientArguments, conf: SparkConf) extends
cp.split(java.io.File.pathSeparator)
}
- val javaOptionsConf = "spark.driver.extraJavaOptions"
- val javaOpts = sys.props.get(javaOptionsConf)
+ val extraJavaOptsConf = "spark.driver.extraJavaOptions"
+ val extraJavaOpts = sys.props.get(extraJavaOptsConf)
+ .map(Utils.splitCommandString).getOrElse(Seq.empty)
+ val sparkJavaOpts = Utils.sparkJavaOpts(conf)
+ val javaOpts = sparkJavaOpts ++ extraJavaOpts
val command = new Command(mainClass, Seq("{{WORKER_URL}}", driverArgs.mainClass) ++
- driverArgs.driverOptions, env, classPathEntries, libraryPathEntries, javaOpts)
+ driverArgs.driverOptions, sys.env, classPathEntries, libraryPathEntries, javaOpts)
val driverDescription = new DriverDescription(
driverArgs.jarUrl,
@@ -109,6 +107,7 @@ private class ClientActor(driverArgs: ClientArguments, conf: SparkConf) extends
// Exception, if present
statusResponse.exception.map { e =>
println(s"Exception from cluster was: $e")
+ e.printStackTrace()
System.exit(-1)
}
System.exit(0)
@@ -141,8 +140,10 @@ private class ClientActor(driverArgs: ClientArguments, conf: SparkConf) extends
*/
object Client {
def main(args: Array[String]) {
- println("WARNING: This client is deprecated and will be removed in a future version of Spark.")
- println("Use ./bin/spark-submit with \"--master spark://host:port\"")
+ if (!sys.props.contains("SPARK_SUBMIT")) {
+ println("WARNING: This client is deprecated and will be removed in a future version of Spark")
+ println("Use ./bin/spark-submit with \"--master spark://host:port\"")
+ }
val conf = new SparkConf()
val driverArgs = new ClientArguments(args)
diff --git a/core/src/main/scala/org/apache/spark/deploy/Command.scala b/core/src/main/scala/org/apache/spark/deploy/Command.scala
index 32f3ba385084f..a2b263544c6a2 100644
--- a/core/src/main/scala/org/apache/spark/deploy/Command.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/Command.scala
@@ -25,5 +25,5 @@ private[spark] case class Command(
environment: Map[String, String],
classPathEntries: Seq[String],
libraryPathEntries: Seq[String],
- extraJavaOptions: Option[String] = None) {
+ javaOpts: Seq[String]) {
}
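`Command.javaOpts` is now a `Seq[String]`, so the Client splits `spark.driver.extraJavaOptions` with `Utils.splitCommandString` instead of forwarding one opaque string. A rough stand-alone approximation (whitespace split only; the real helper also honors quoting, and the option string here is made up):

object JavaOptsSplitSketch {
  def main(args: Array[String]): Unit = {
    val raw = Option("-XX:+UseG1GC -Dspark.test=true") // hypothetical extraJavaOptions value
    val extraJavaOpts = raw.map(_.trim.split("""\s+""").toList).getOrElse(Nil)
    println(extraJavaOpts) // List(-XX:+UseG1GC, -Dspark.test=true)
  }
}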
diff --git a/core/src/main/scala/org/apache/spark/deploy/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/deploy/JsonProtocol.scala
index c4f5e294a393e..696f32a6f5730 100644
--- a/core/src/main/scala/org/apache/spark/deploy/JsonProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/JsonProtocol.scala
@@ -56,7 +56,6 @@ private[spark] object JsonProtocol {
("cores" -> obj.maxCores) ~
("memoryperslave" -> obj.memoryPerSlave) ~
("user" -> obj.user) ~
- ("sparkhome" -> obj.sparkHome) ~
("command" -> obj.command.toString)
}
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
index b050dccb6d57f..318509a67a36f 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -27,25 +27,43 @@ import org.apache.spark.executor.ExecutorURLClassLoader
import org.apache.spark.util.Utils
/**
- * Scala code behind the spark-submit script. The script handles setting up the classpath with
- * relevant Spark dependencies and provides a layer over the different cluster managers and deploy
- * modes that Spark supports.
+ * Main gateway of launching a Spark application.
+ *
+ * This program handles setting up the classpath with relevant Spark dependencies and provides
+ * a layer over the different cluster managers and deploy modes that Spark supports.
*/
object SparkSubmit {
+
+ // Cluster managers
private val YARN = 1
private val STANDALONE = 2
private val MESOS = 4
private val LOCAL = 8
private val ALL_CLUSTER_MGRS = YARN | STANDALONE | MESOS | LOCAL
- private var clusterManager: Int = LOCAL
+ // Deploy modes
+ private val CLIENT = 1
+ private val CLUSTER = 2
+ private val ALL_DEPLOY_MODES = CLIENT | CLUSTER
- /**
- * Special primary resource names that represent shells rather than application jars.
- */
+ // A special jar name that indicates the class being run is inside of Spark itself, and therefore
+ // no user jar is needed.
+ private val SPARK_INTERNAL = "spark-internal"
+
+ // Special primary resource names that represent shells rather than application jars.
private val SPARK_SHELL = "spark-shell"
private val PYSPARK_SHELL = "pyspark-shell"
+ // Exposed for testing
+ private[spark] var exitFn: () => Unit = () => System.exit(-1)
+ private[spark] var printStream: PrintStream = System.err
+ private[spark] def printWarning(str: String) = printStream.println("Warning: " + str)
+ private[spark] def printErrorAndExit(str: String) = {
+ printStream.println("Error: " + str)
+ printStream.println("Run with --help for usage help or --verbose for debug output")
+ exitFn()
+ }
+
def main(args: Array[String]) {
val appArgs = new SparkSubmitArguments(args)
if (appArgs.verbose) {
@@ -55,88 +73,78 @@ object SparkSubmit {
launch(childArgs, classpath, sysProps, mainClass, appArgs.verbose)
}
- // Exposed for testing
- private[spark] var printStream: PrintStream = System.err
- private[spark] var exitFn: () => Unit = () => System.exit(-1)
-
- private[spark] def printErrorAndExit(str: String) = {
- printStream.println("Error: " + str)
- printStream.println("Run with --help for usage help or --verbose for debug output")
- exitFn()
- }
- private[spark] def printWarning(str: String) = printStream.println("Warning: " + str)
-
/**
- * @return a tuple containing the arguments for the child, a list of classpath
- * entries for the child, a list of system properties, a list of env vars
- * and the main class for the child
+ * @return a tuple containing
+ * (1) the arguments for the child process,
+ * (2) a list of classpath entries for the child,
+ * (3) a list of system properties and env vars, and
+ * (4) the main class for the child
*/
private[spark] def createLaunchEnv(args: SparkSubmitArguments)
: (ArrayBuffer[String], ArrayBuffer[String], Map[String, String], String) = {
- if (args.master.startsWith("local")) {
- clusterManager = LOCAL
- } else if (args.master.startsWith("yarn")) {
- clusterManager = YARN
- } else if (args.master.startsWith("spark")) {
- clusterManager = STANDALONE
- } else if (args.master.startsWith("mesos")) {
- clusterManager = MESOS
- } else {
- printErrorAndExit("Master must start with yarn, mesos, spark, or local")
- }
- // Because "yarn-cluster" and "yarn-client" encapsulate both the master
- // and deploy mode, we have some logic to infer the master and deploy mode
- // from each other if only one is specified, or exit early if they are at odds.
- if (args.deployMode == null &&
- (args.master == "yarn-standalone" || args.master == "yarn-cluster")) {
- args.deployMode = "cluster"
- }
- if (args.deployMode == "cluster" && args.master == "yarn-client") {
- printErrorAndExit("Deploy mode \"cluster\" and master \"yarn-client\" are not compatible")
- }
- if (args.deployMode == "client" &&
- (args.master == "yarn-standalone" || args.master == "yarn-cluster")) {
- printErrorAndExit("Deploy mode \"client\" and master \"" + args.master
- + "\" are not compatible")
- }
- if (args.deployMode == "cluster" && args.master.startsWith("yarn")) {
- args.master = "yarn-cluster"
- }
- if (args.deployMode != "cluster" && args.master.startsWith("yarn")) {
- args.master = "yarn-client"
- }
-
- val deployOnCluster = Option(args.deployMode).getOrElse("client") == "cluster"
-
- val childClasspath = new ArrayBuffer[String]()
+ // Values to return
val childArgs = new ArrayBuffer[String]()
+ val childClasspath = new ArrayBuffer[String]()
val sysProps = new HashMap[String, String]()
var childMainClass = ""
- val isPython = args.isPython
- val isYarnCluster = clusterManager == YARN && deployOnCluster
+ // Set the cluster manager
+ val clusterManager: Int = args.master match {
+ case m if m.startsWith("yarn") => YARN
+ case m if m.startsWith("spark") => STANDALONE
+ case m if m.startsWith("mesos") => MESOS
+ case m if m.startsWith("local") => LOCAL
+ case _ => printErrorAndExit("Master must start with yarn, spark, mesos, or local"); -1
+ }
- // For mesos, only client mode is supported
- if (clusterManager == MESOS && deployOnCluster) {
- printErrorAndExit("Cluster deploy mode is currently not supported for Mesos clusters.")
+ // Set the deploy mode; default is client mode
+ var deployMode: Int = args.deployMode match {
+ case "client" | null => CLIENT
+ case "cluster" => CLUSTER
+ case _ => printErrorAndExit("Deploy mode must be either client or cluster"); -1
}
- // For standalone, only client mode is supported
- if (clusterManager == STANDALONE && deployOnCluster) {
- printErrorAndExit("Cluster deploy mode is currently not supported for standalone clusters.")
+ // Because "yarn-cluster" and "yarn-client" encapsulate both the master
+ // and deploy mode, we have some logic to infer the master and deploy mode
+ // from each other if only one is specified, or exit early if they are at odds.
+ if (clusterManager == YARN) {
+ if (args.master == "yarn-standalone") {
+ printWarning("\"yarn-standalone\" is deprecated. Use \"yarn-cluster\" instead.")
+ args.master = "yarn-cluster"
+ }
+ (args.master, args.deployMode) match {
+ case ("yarn-cluster", null) =>
+ deployMode = CLUSTER
+ case ("yarn-cluster", "client") =>
+ printErrorAndExit("Client deploy mode is not compatible with master \"yarn-cluster\"")
+ case ("yarn-client", "cluster") =>
+ printErrorAndExit("Cluster deploy mode is not compatible with master \"yarn-client\"")
+ case (_, mode) =>
+ args.master = "yarn-" + Option(mode).getOrElse("client")
+ }
+
+ // Make sure YARN is included in our build if we're trying to use it
+ if (!Utils.classIsLoadable("org.apache.spark.deploy.yarn.Client") && !Utils.isTesting) {
+ printErrorAndExit(
+ "Could not load YARN classes. " +
+ "This copy of Spark may not have been compiled with YARN support.")
+ }
}
- // For shells, only client mode is applicable
- if (isShell(args.primaryResource) && deployOnCluster) {
- printErrorAndExit("Cluster deploy mode is not applicable to Spark shells.")
+ // The following modes are not supported or applicable
+ (clusterManager, deployMode) match {
+ case (MESOS, CLUSTER) =>
+ printErrorAndExit("Cluster deploy mode is currently not supported for Mesos clusters.")
+ case (_, CLUSTER) if args.isPython =>
+ printErrorAndExit("Cluster deploy mode is currently not supported for python applications.")
+ case (_, CLUSTER) if isShell(args.primaryResource) =>
+ printErrorAndExit("Cluster deploy mode is not applicable to Spark shells.")
+ case _ =>
}
// If we're running a python app, set the main class to our specific python runner
- if (isPython) {
- if (deployOnCluster) {
- printErrorAndExit("Cluster deploy mode is currently not supported for python.")
- }
+ if (args.isPython) {
if (args.primaryResource == PYSPARK_SHELL) {
args.mainClass = "py4j.GatewayServer"
args.childArgs = ArrayBuffer("--die-on-broken-pipe", "0")
@@ -152,120 +160,123 @@ object SparkSubmit {
sysProps("spark.submit.pyFiles") = PythonRunner.formatPaths(args.pyFiles).mkString(",")
}
- // If we're deploying into YARN, use yarn.Client as a wrapper around the user class
- if (!deployOnCluster) {
- childMainClass = args.mainClass
- if (isUserJar(args.primaryResource)) {
- childClasspath += args.primaryResource
- }
- } else if (clusterManager == YARN) {
- childMainClass = "org.apache.spark.deploy.yarn.Client"
- childArgs += ("--jar", args.primaryResource)
- childArgs += ("--class", args.mainClass)
- }
-
- // Make sure YARN is included in our build if we're trying to use it
- if (clusterManager == YARN) {
- if (!Utils.classIsLoadable("org.apache.spark.deploy.yarn.Client") && !Utils.isTesting) {
- printErrorAndExit("Could not load YARN classes. " +
- "This copy of Spark may not have been compiled with YARN support.")
- }
- }
-
// Special flag to avoid deprecation warnings at the client
sysProps("SPARK_SUBMIT") = "true"
// A list of rules to map each argument to system properties or command-line options in
// each deploy mode; we iterate through these below
val options = List[OptionAssigner](
- OptionAssigner(args.master, ALL_CLUSTER_MGRS, false, sysProp = "spark.master"),
- OptionAssigner(args.name, ALL_CLUSTER_MGRS, false, sysProp = "spark.app.name"),
- OptionAssigner(args.name, YARN, true, clOption = "--name", sysProp = "spark.app.name"),
- OptionAssigner(args.driverExtraClassPath, STANDALONE | YARN, true,
+
+ // All cluster managers
+ OptionAssigner(args.master, ALL_CLUSTER_MGRS, ALL_DEPLOY_MODES, sysProp = "spark.master"),
+ OptionAssigner(args.name, ALL_CLUSTER_MGRS, ALL_DEPLOY_MODES, sysProp = "spark.app.name"),
+ OptionAssigner(args.jars, ALL_CLUSTER_MGRS, ALL_DEPLOY_MODES, sysProp = "spark.jars"),
+
+ // Standalone cluster only
+ OptionAssigner(args.driverMemory, STANDALONE, CLUSTER, clOption = "--memory"),
+ OptionAssigner(args.driverCores, STANDALONE, CLUSTER, clOption = "--cores"),
+
+ // Yarn client only
+ OptionAssigner(args.queue, YARN, CLIENT, sysProp = "spark.yarn.queue"),
+ OptionAssigner(args.numExecutors, YARN, CLIENT, sysProp = "spark.executor.instances"),
+ OptionAssigner(args.executorCores, YARN, CLIENT, sysProp = "spark.executor.cores"),
+ OptionAssigner(args.files, YARN, CLIENT, sysProp = "spark.yarn.dist.files"),
+ OptionAssigner(args.archives, YARN, CLIENT, sysProp = "spark.yarn.dist.archives"),
+
+ // Yarn cluster only
+ OptionAssigner(args.name, YARN, CLUSTER, clOption = "--name"),
+ OptionAssigner(args.driverMemory, YARN, CLUSTER, clOption = "--driver-memory"),
+ OptionAssigner(args.queue, YARN, CLUSTER, clOption = "--queue"),
+ OptionAssigner(args.numExecutors, YARN, CLUSTER, clOption = "--num-executors"),
+ OptionAssigner(args.executorMemory, YARN, CLUSTER, clOption = "--executor-memory"),
+ OptionAssigner(args.executorCores, YARN, CLUSTER, clOption = "--executor-cores"),
+ OptionAssigner(args.files, YARN, CLUSTER, clOption = "--files"),
+ OptionAssigner(args.archives, YARN, CLUSTER, clOption = "--archives"),
+ OptionAssigner(args.jars, YARN, CLUSTER, clOption = "--addJars"),
+
+ // Other options
+ OptionAssigner(args.driverExtraClassPath, STANDALONE | YARN, CLUSTER,
sysProp = "spark.driver.extraClassPath"),
- OptionAssigner(args.driverExtraJavaOptions, STANDALONE | YARN, true,
+ OptionAssigner(args.driverExtraJavaOptions, STANDALONE | YARN, CLUSTER,
sysProp = "spark.driver.extraJavaOptions"),
- OptionAssigner(args.driverExtraLibraryPath, STANDALONE | YARN, true,
+ OptionAssigner(args.driverExtraLibraryPath, STANDALONE | YARN, CLUSTER,
sysProp = "spark.driver.extraLibraryPath"),
- OptionAssigner(args.driverMemory, YARN, true, clOption = "--driver-memory"),
- OptionAssigner(args.driverMemory, STANDALONE, true, clOption = "--memory"),
- OptionAssigner(args.driverCores, STANDALONE, true, clOption = "--cores"),
- OptionAssigner(args.queue, YARN, true, clOption = "--queue"),
- OptionAssigner(args.queue, YARN, false, sysProp = "spark.yarn.queue"),
- OptionAssigner(args.numExecutors, YARN, true, clOption = "--num-executors"),
- OptionAssigner(args.numExecutors, YARN, false, sysProp = "spark.executor.instances"),
- OptionAssigner(args.executorMemory, YARN, true, clOption = "--executor-memory"),
- OptionAssigner(args.executorMemory, STANDALONE | MESOS | YARN, false,
+ OptionAssigner(args.executorMemory, STANDALONE | MESOS | YARN, ALL_DEPLOY_MODES,
sysProp = "spark.executor.memory"),
- OptionAssigner(args.executorCores, YARN, true, clOption = "--executor-cores"),
- OptionAssigner(args.executorCores, YARN, false, sysProp = "spark.executor.cores"),
- OptionAssigner(args.totalExecutorCores, STANDALONE | MESOS, false,
+ OptionAssigner(args.totalExecutorCores, STANDALONE | MESOS, ALL_DEPLOY_MODES,
sysProp = "spark.cores.max"),
- OptionAssigner(args.files, YARN, false, sysProp = "spark.yarn.dist.files"),
- OptionAssigner(args.files, YARN, true, clOption = "--files"),
- OptionAssigner(args.files, LOCAL | STANDALONE | MESOS, false, sysProp = "spark.files"),
- OptionAssigner(args.files, LOCAL | STANDALONE | MESOS, true, sysProp = "spark.files"),
- OptionAssigner(args.archives, YARN, false, sysProp = "spark.yarn.dist.archives"),
- OptionAssigner(args.archives, YARN, true, clOption = "--archives"),
- OptionAssigner(args.jars, YARN, true, clOption = "--addJars"),
- OptionAssigner(args.jars, ALL_CLUSTER_MGRS, false, sysProp = "spark.jars")
+ OptionAssigner(args.files, LOCAL | STANDALONE | MESOS, ALL_DEPLOY_MODES,
+ sysProp = "spark.files")
)
- // For client mode make any added jars immediately visible on the classpath
- if (args.jars != null && !deployOnCluster) {
- for (jar <- args.jars.split(",")) {
- childClasspath += jar
+ // In client mode, launch the application main class directly
+ // In addition, add the main application jar and any added jars (if any) to the classpath
+ if (deployMode == CLIENT) {
+ childMainClass = args.mainClass
+ if (isUserJar(args.primaryResource)) {
+ childClasspath += args.primaryResource
}
+ if (args.jars != null) { childClasspath ++= args.jars.split(",") }
+ if (args.childArgs != null) { childArgs ++= args.childArgs }
}
+
// Map all arguments to command-line options or system properties for our chosen mode
for (opt <- options) {
- if (opt.value != null && deployOnCluster == opt.deployOnCluster &&
+ if (opt.value != null &&
+ (deployMode & opt.deployMode) != 0 &&
(clusterManager & opt.clusterManager) != 0) {
- if (opt.clOption != null) {
- childArgs += (opt.clOption, opt.value)
- }
- if (opt.sysProp != null) {
- sysProps.put(opt.sysProp, opt.value)
- }
+ if (opt.clOption != null) { childArgs += (opt.clOption, opt.value) }
+ if (opt.sysProp != null) { sysProps.put(opt.sysProp, opt.value) }
}
}
// Add the application jar automatically so the user doesn't have to call sc.addJar
// For YARN cluster mode, the jar is already distributed on each node as "app.jar"
// For python files, the primary resource is already distributed as a regular file
- if (!isYarnCluster && !isPython) {
- var jars = sysProps.get("spark.jars").map(x => x.split(",").toSeq).getOrElse(Seq())
+ val isYarnCluster = clusterManager == YARN && deployMode == CLUSTER
+ if (!isYarnCluster && !args.isPython) {
+ var jars = sysProps.get("spark.jars").map(x => x.split(",").toSeq).getOrElse(Seq.empty)
if (isUserJar(args.primaryResource)) {
jars = jars ++ Seq(args.primaryResource)
}
sysProps.put("spark.jars", jars.mkString(","))
}
- // Standalone cluster specific configurations
- if (deployOnCluster && clusterManager == STANDALONE) {
+ // In standalone-cluster mode, use Client as a wrapper around the user class
+ if (clusterManager == STANDALONE && deployMode == CLUSTER) {
+ childMainClass = "org.apache.spark.deploy.Client"
if (args.supervise) {
childArgs += "--supervise"
}
- childMainClass = "org.apache.spark.deploy.Client"
childArgs += "launch"
childArgs += (args.master, args.primaryResource, args.mainClass)
+ if (args.childArgs != null) {
+ childArgs ++= args.childArgs
+ }
}
- // Arguments to be passed to user program
- if (args.childArgs != null) {
- if (!deployOnCluster || clusterManager == STANDALONE) {
- childArgs ++= args.childArgs
- } else if (clusterManager == YARN) {
- for (arg <- args.childArgs) {
- childArgs += ("--arg", arg)
- }
+ // In yarn-cluster mode, use yarn.Client as a wrapper around the user class
+ if (clusterManager == YARN && deployMode == CLUSTER) {
+ childMainClass = "org.apache.spark.deploy.yarn.Client"
+ if (args.primaryResource != SPARK_INTERNAL) {
+ childArgs += ("--jar", args.primaryResource)
+ }
+ childArgs += ("--class", args.mainClass)
+ if (args.childArgs != null) {
+ args.childArgs.foreach { arg => childArgs += ("--arg", arg) }
}
}
+    // Properties given with --conf are superseded by other options, but take precedence over
+ // properties in the defaults file.
+ for ((k, v) <- args.sparkProperties) {
+ sysProps.getOrElseUpdate(k, v)
+ }
+
// Read from default spark properties, if any
for ((k, v) <- args.getDefaultSparkProperties) {
- if (!sysProps.contains(k)) sysProps(k) = v
+ sysProps.getOrElseUpdate(k, v)
}
(childArgs, childClasspath, sysProps, childMainClass)
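
// Illustrative sketch (not part of the patch): how the OptionAssigner rules above are selected.
// The flag values below are assumptions for the example only; the patch defines its own
// YARN / STANDALONE / MESOS / LOCAL and CLIENT / CLUSTER constants inside SparkSubmit.
object OptionAssignerSketch {
  val YARN = 1; val STANDALONE = 2; val MESOS = 4; val LOCAL = 8
  val ALL_CLUSTER_MGRS = YARN | STANDALONE | MESOS | LOCAL
  val CLIENT = 1; val CLUSTER = 2
  val ALL_DEPLOY_MODES = CLIENT | CLUSTER

  case class OptionAssigner(
      value: String,
      clusterManager: Int,
      deployMode: Int,
      clOption: String = null,
      sysProp: String = null)

  def main(args: Array[String]): Unit = {
    val rules = Seq(
      OptionAssigner("4g", YARN, CLUSTER, clOption = "--driver-memory"),
      OptionAssigner("2", YARN, CLIENT, sysProp = "spark.executor.cores"),
      OptionAssigner("myApp", ALL_CLUSTER_MGRS, ALL_DEPLOY_MODES, sysProp = "spark.app.name"))

    val clusterManager = YARN
    val deployMode = CLUSTER

    // A rule applies when its value is set and both bit masks overlap the chosen combination.
    val applicable = rules.filter { r =>
      r.value != null &&
        (r.clusterManager & clusterManager) != 0 &&
        (r.deployMode & deployMode) != 0
    }
    applicable.foreach(println)  // keeps --driver-memory and spark.app.name; drops the client-only rule
  }
}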
@@ -328,7 +339,7 @@ object SparkSubmit {
* Return whether the given primary resource represents a user jar.
*/
private def isUserJar(primaryResource: String): Boolean = {
- !isShell(primaryResource) && !isPython(primaryResource)
+ !isShell(primaryResource) && !isPython(primaryResource) && !isInternal(primaryResource)
}
/**
@@ -345,6 +356,10 @@ object SparkSubmit {
primaryResource.endsWith(".py") || primaryResource == PYSPARK_SHELL
}
+ private[spark] def isInternal(primaryResource: String): Boolean = {
+ primaryResource == SPARK_INTERNAL
+ }
+
/**
* Merge a sequence of comma-separated file lists, some of which may be null to indicate
* no files, into a single comma-separated string.
@@ -364,6 +379,6 @@ object SparkSubmit {
private[spark] case class OptionAssigner(
value: String,
clusterManager: Int,
- deployOnCluster: Boolean,
+ deployMode: Int,
clOption: String = null,
sysProp: String = null)
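
// Illustrative sketch (not part of the patch): the yarn master / deploy-mode inference from
// createLaunchEnv above, reduced to a pure function. printErrorAndExit is replaced by sys.error
// so the sketch stays self-contained; the strings mirror the hunk.
object YarnModeSketch {
  def resolve(master: String, deployMode: String): (String, String) =
    (master, deployMode) match {
      case ("yarn-cluster", null)     => ("yarn-cluster", "cluster")
      case ("yarn-cluster", "client") => sys.error("Client deploy mode is not compatible with master \"yarn-cluster\"")
      case ("yarn-client", "cluster") => sys.error("Cluster deploy mode is not compatible with master \"yarn-client\"")
      case (_, mode)                  => ("yarn-" + Option(mode).getOrElse("client"), Option(mode).getOrElse("client"))
    }

  def main(args: Array[String]): Unit = {
    println(resolve("yarn", null))              // (yarn-client,client)
    println(resolve("yarn-cluster", null))      // (yarn-cluster,cluster)
    println(resolve("yarn-cluster", "cluster")) // (yarn-cluster,cluster)
  }
}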
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
index 57655aa4c32b1..9391f24e71ed7 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
@@ -55,9 +55,10 @@ private[spark] class SparkSubmitArguments(args: Seq[String]) {
var verbose: Boolean = false
var isPython: Boolean = false
var pyFiles: String = null
+ val sparkProperties: HashMap[String, String] = new HashMap[String, String]()
parseOpts(args.toList)
- loadDefaults()
+ mergeSparkProperties()
checkRequiredArguments()
/** Return default present in the currently defined defaults file. */
@@ -78,10 +79,23 @@ private[spark] class SparkSubmitArguments(args: Seq[String]) {
defaultProperties
}
- /** Fill in any undefined values based on the current properties file or built-in defaults. */
- private def loadDefaults(): Unit = {
-
+ /**
+ * Fill in any undefined values based on the default properties file or options passed in through
+ * the '--conf' flag.
+ */
+ private def mergeSparkProperties(): Unit = {
// Use common defaults file, if not specified by user
+ if (propertiesFile == null) {
+ sys.env.get("SPARK_CONF_DIR").foreach { sparkConfDir =>
+ val sep = File.separator
+ val defaultPath = s"${sparkConfDir}${sep}spark-defaults.conf"
+ val file = new File(defaultPath)
+ if (file.exists()) {
+ propertiesFile = file.getAbsolutePath
+ }
+ }
+ }
+
if (propertiesFile == null) {
sys.env.get("SPARK_HOME").foreach { sparkHome =>
val sep = File.separator
@@ -93,18 +107,20 @@ private[spark] class SparkSubmitArguments(args: Seq[String]) {
}
}
- val defaultProperties = getDefaultSparkProperties
+ val properties = getDefaultSparkProperties
+ properties.putAll(sparkProperties)
+
// Use properties file as fallback for values which have a direct analog to
// arguments in this script.
- master = Option(master).getOrElse(defaultProperties.get("spark.master").orNull)
+ master = Option(master).getOrElse(properties.get("spark.master").orNull)
executorMemory = Option(executorMemory)
- .getOrElse(defaultProperties.get("spark.executor.memory").orNull)
+ .getOrElse(properties.get("spark.executor.memory").orNull)
executorCores = Option(executorCores)
- .getOrElse(defaultProperties.get("spark.executor.cores").orNull)
+ .getOrElse(properties.get("spark.executor.cores").orNull)
totalExecutorCores = Option(totalExecutorCores)
- .getOrElse(defaultProperties.get("spark.cores.max").orNull)
- name = Option(name).getOrElse(defaultProperties.get("spark.app.name").orNull)
- jars = Option(jars).getOrElse(defaultProperties.get("spark.jars").orNull)
+ .getOrElse(properties.get("spark.cores.max").orNull)
+ name = Option(name).getOrElse(properties.get("spark.app.name").orNull)
+ jars = Option(jars).getOrElse(properties.get("spark.jars").orNull)
// This supports env vars in older versions of Spark
master = Option(master).getOrElse(System.getenv("MASTER"))
@@ -177,6 +193,7 @@ private[spark] class SparkSubmitArguments(args: Seq[String]) {
| executorCores $executorCores
| totalExecutorCores $totalExecutorCores
| propertiesFile $propertiesFile
+ | extraSparkProperties $sparkProperties
| driverMemory $driverMemory
| driverCores $driverCores
| driverExtraClassPath $driverExtraClassPath
@@ -202,8 +219,9 @@ private[spark] class SparkSubmitArguments(args: Seq[String]) {
/** Fill in values by parsing user options. */
private def parseOpts(opts: Seq[String]): Unit = {
- // Delineates parsing of Spark options from parsing of user options.
var inSparkOpts = true
+
+ // Delineates parsing of Spark options from parsing of user options.
parse(opts)
def parse(opts: Seq[String]): Unit = opts match {
@@ -290,6 +308,13 @@ private[spark] class SparkSubmitArguments(args: Seq[String]) {
jars = Utils.resolveURIs(value)
parse(tail)
+ case ("--conf" | "-c") :: value :: tail =>
+ value.split("=", 2).toSeq match {
+ case Seq(k, v) => sparkProperties(k) = v
+ case _ => SparkSubmit.printErrorAndExit(s"Spark config without '=': $value")
+ }
+ parse(tail)
+
case ("--help" | "-h") :: tail =>
printUsageAndExit(0)
@@ -309,7 +334,7 @@ private[spark] class SparkSubmitArguments(args: Seq[String]) {
SparkSubmit.printErrorAndExit(errMessage)
case v =>
primaryResource =
- if (!SparkSubmit.isShell(v)) {
+ if (!SparkSubmit.isShell(v) && !SparkSubmit.isInternal(v)) {
Utils.resolveURI(v).toString
} else {
v
@@ -349,6 +374,8 @@ private[spark] class SparkSubmitArguments(args: Seq[String]) {
| on the PYTHONPATH for Python apps.
| --files FILES Comma-separated list of files to be placed in the working
| directory of each executor.
+ |
+ | --conf PROP=VALUE Arbitrary Spark configuration property.
| --properties-file FILE Path to a file from which to load extra properties. If not
| specified, this will look for conf/spark-defaults.conf.
|
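
// Illustrative sketch (not part of the patch): how a "--conf key=value" pair is split and how
// mergeSparkProperties layers the sources — explicit arguments win over --conf values, which in
// turn win over entries from the defaults file. The property names are just example data.
import scala.collection.mutable

object ConfPrecedenceSketch {
  def main(args: Array[String]): Unit = {
    val defaultsFile = Map("spark.master" -> "local[2]", "spark.app.name" -> "from-defaults")
    val confFlags = Seq("spark.app.name=from-conf", "spark.eventLog.enabled=true")

    val sparkProperties = new mutable.HashMap[String, String]()
    confFlags.foreach { kv =>
      kv.split("=", 2) match {
        case Array(k, v) => sparkProperties(k) = v
        case _           => sys.error(s"Spark config without '=': $kv")
      }
    }
    // Defaults only fill in keys that --conf (or an explicit option) did not already set.
    defaultsFile.foreach { case (k, v) => sparkProperties.getOrElseUpdate(k, v) }
    println(sparkProperties)
    // spark.app.name -> from-conf, spark.eventLog.enabled -> true, spark.master -> local[2]
  }
}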
diff --git a/core/src/main/scala/org/apache/spark/deploy/client/TestClient.scala b/core/src/main/scala/org/apache/spark/deploy/client/TestClient.scala
index e15a87bd38fda..88a0862b96afe 100644
--- a/core/src/main/scala/org/apache/spark/deploy/client/TestClient.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/client/TestClient.scala
@@ -46,11 +46,10 @@ private[spark] object TestClient {
def main(args: Array[String]) {
val url = args(0)
val conf = new SparkConf
- val (actorSystem, port) = AkkaUtils.createActorSystem("spark", Utils.localIpAddress, 0,
+ val (actorSystem, _) = AkkaUtils.createActorSystem("spark", Utils.localIpAddress, 0,
conf = conf, securityManager = new SecurityManager(conf))
- val desc = new ApplicationDescription(
- "TestClient", Some(1), 512, Command("spark.deploy.client.TestExecutor", Seq(), Map(), Seq(),
- Seq()), Some("dummy-spark-home"), "ignored")
+ val desc = new ApplicationDescription("TestClient", Some(1), 512,
+ Command("spark.deploy.client.TestExecutor", Seq(), Map(), Seq(), Seq(), Seq()), "ignored")
val listener = new TestListener
val client = new AppClient(actorSystem, Array(url), desc, listener, new SparkConf)
client.start()
diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala
index a8c9ac072449f..6d2d4cef1ee46 100644
--- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala
@@ -36,11 +36,11 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis
conf.getInt("spark.history.updateInterval", 10)) * 1000
private val logDir = conf.get("spark.history.fs.logDirectory", null)
- if (logDir == null) {
- throw new IllegalArgumentException("Logging directory must be specified.")
- }
+ private val resolvedLogDir = Option(logDir)
+ .map { d => Utils.resolveURI(d) }
+ .getOrElse { throw new IllegalArgumentException("Logging directory must be specified.") }
- private val fs = Utils.getHadoopFileSystem(logDir)
+ private val fs = Utils.getHadoopFileSystem(resolvedLogDir)
// A timestamp of when the disk was last accessed to check for log updates
private var lastLogCheckTimeMs = -1L
@@ -76,14 +76,14 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis
private def initialize() {
// Validate the log directory.
- val path = new Path(logDir)
+ val path = new Path(resolvedLogDir)
if (!fs.exists(path)) {
throw new IllegalArgumentException(
- "Logging directory specified does not exist: %s".format(logDir))
+ "Logging directory specified does not exist: %s".format(resolvedLogDir))
}
if (!fs.getFileStatus(path).isDir) {
throw new IllegalArgumentException(
- "Logging directory specified is not a directory: %s".format(logDir))
+ "Logging directory specified is not a directory: %s".format(resolvedLogDir))
}
checkForLogs()
@@ -95,15 +95,16 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis
override def getAppUI(appId: String): SparkUI = {
try {
- val appLogDir = fs.getFileStatus(new Path(logDir, appId))
- loadAppInfo(appLogDir, true)._2
+ val appLogDir = fs.getFileStatus(new Path(resolvedLogDir.toString, appId))
+ val (_, ui) = loadAppInfo(appLogDir, renderUI = true)
+ ui
} catch {
case e: FileNotFoundException => null
}
}
override def getConfig(): Map[String, String] =
- Map(("Event Log Location" -> logDir))
+ Map("Event Log Location" -> resolvedLogDir.toString)
/**
* Builds the application list based on the current contents of the log directory.
@@ -114,14 +115,14 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis
lastLogCheckTimeMs = getMonotonicTimeMs()
logDebug("Checking for logs. Time is now %d.".format(lastLogCheckTimeMs))
try {
- val logStatus = fs.listStatus(new Path(logDir))
+ val logStatus = fs.listStatus(new Path(resolvedLogDir))
val logDirs = if (logStatus != null) logStatus.filter(_.isDir).toSeq else Seq[FileStatus]()
- val logInfos = logDirs.filter {
- dir => fs.isFile(new Path(dir.getPath(), EventLoggingListener.APPLICATION_COMPLETE))
+ val logInfos = logDirs.filter { dir =>
+ fs.isFile(new Path(dir.getPath, EventLoggingListener.APPLICATION_COMPLETE))
}
val currentApps = Map[String, ApplicationHistoryInfo](
- appList.map(app => (app.id -> app)):_*)
+ appList.map(app => app.id -> app):_*)
// For any application that either (i) is not listed or (ii) has changed since the last time
// the listing was created (defined by the log dir's modification time), load the app's info.
@@ -131,7 +132,8 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis
val curr = currentApps.getOrElse(dir.getPath().getName(), null)
if (curr == null || curr.lastUpdated < getModificationTime(dir)) {
try {
- newApps += loadAppInfo(dir, false)._1
+ val (app, _) = loadAppInfo(dir, renderUI = false)
+ newApps += app
} catch {
case e: Exception => logError(s"Failed to load app info from directory $dir.")
}
@@ -159,9 +161,9 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis
* @return A 2-tuple `(app info, ui)`. `ui` will be null if `renderUI` is false.
*/
private def loadAppInfo(logDir: FileStatus, renderUI: Boolean) = {
- val elogInfo = EventLoggingListener.parseLoggingInfo(logDir.getPath(), fs)
val path = logDir.getPath
val appId = path.getName
+ val elogInfo = EventLoggingListener.parseLoggingInfo(path, fs)
val replayBus = new ReplayListenerBus(elogInfo.logPaths, fs, elogInfo.compressionCodec)
val appListener = new ApplicationEventListener
replayBus.addListener(appListener)
@@ -169,7 +171,8 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis
val ui: SparkUI = if (renderUI) {
val conf = this.conf.clone()
val appSecManager = new SecurityManager(conf)
- new SparkUI(conf, appSecManager, replayBus, appId, "/history/" + appId)
+ new SparkUI(conf, appSecManager, replayBus, appId,
+ HistoryServer.UI_PATH_PREFIX + s"/$appId")
// Do not call ui.bind() to avoid creating a new server for each application
} else {
null
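
// Illustrative sketch (not part of the patch): what resolving the history log directory to a URI
// buys in FsHistoryProvider. Utils.resolveURI is Spark-internal; this rough approximation uses
// only java.net.URI and java.io.File, defaulting scheme-less paths to the file: scheme.
import java.io.File
import java.net.URI

object ResolveUriSketch {
  def resolveUri(raw: String): URI = {
    val uri = new URI(raw)
    // Keep explicit schemes (hdfs://, file:/, ...) as given; treat bare paths as local files.
    if (uri.getScheme != null) uri else new File(raw).getAbsoluteFile.toURI
  }

  def main(args: Array[String]): Unit = {
    println(resolveUri("/tmp/spark-events"))            // file:/tmp/spark-events
    println(resolveUri("hdfs://namenode:8020/events"))  // unchanged
  }
}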
diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala
index a958c837c2ff6..c4ef8b63b0071 100644
--- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala
@@ -45,7 +45,7 @@ private[spark] class HistoryPage(parent: HistoryServer) extends WebUIPage("") {
-            { providerConfig.map(e => <li><strong>{e._1}:</strong> {e._2}</li>) }
+            {providerConfig.map { case (k, v) => <li><strong>{k}:</strong> {v}</li> }}
{
if (allApps.size > 0) {
@@ -75,7 +75,7 @@ private[spark] class HistoryPage(parent: HistoryServer) extends WebUIPage("") {
"Last Updated")
private def appRow(info: ApplicationHistoryInfo): Seq[Node] = {
- val uiAddress = "/history/" + info.id
+ val uiAddress = HistoryServer.UI_PATH_PREFIX + s"/${info.id}"
val startTime = UIUtils.formatDate(info.startTime)
val endTime = UIUtils.formatDate(info.endTime)
val duration = UIUtils.formatDuration(info.endTime - info.startTime)
diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala
index 56b38ddfc9313..d1a64c1912cb8 100644
--- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala
@@ -25,9 +25,9 @@ import org.eclipse.jetty.servlet.{ServletContextHandler, ServletHolder}
import org.apache.spark.{Logging, SecurityManager, SparkConf}
import org.apache.spark.deploy.SparkHadoopUtil
-import org.apache.spark.ui.{WebUI, SparkUI, UIUtils}
+import org.apache.spark.ui.{SparkUI, UIUtils, WebUI}
import org.apache.spark.ui.JettyUtils._
-import org.apache.spark.util.{SignalLogger, Utils}
+import org.apache.spark.util.SignalLogger
/**
* A web server that renders SparkUIs of completed applications.
@@ -114,7 +114,7 @@ class HistoryServer(
attachHandler(createStaticHandler(SparkUI.STATIC_RESOURCE_DIR, "/static"))
val contextHandler = new ServletContextHandler
- contextHandler.setContextPath("/history")
+ contextHandler.setContextPath(HistoryServer.UI_PATH_PREFIX)
contextHandler.addServlet(new ServletHolder(loaderServlet), "/*")
attachHandler(contextHandler)
}
@@ -172,10 +172,12 @@ class HistoryServer(
object HistoryServer extends Logging {
private val conf = new SparkConf
+ val UI_PATH_PREFIX = "/history"
+
def main(argStrings: Array[String]) {
SignalLogger.register(log)
initSecurity()
- val args = new HistoryServerArguments(conf, argStrings)
+ new HistoryServerArguments(conf, argStrings)
val securityManager = new SecurityManager(conf)
val providerName = conf.getOption("spark.history.provider")
diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala
index be9361b754fc3..25fc76c23e0fb 100644
--- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala
@@ -18,7 +18,6 @@
package org.apache.spark.deploy.history
import org.apache.spark.SparkConf
-import org.apache.spark.util.Utils
/**
* Command-line parser for the master.
@@ -32,6 +31,7 @@ private[spark] class HistoryServerArguments(conf: SparkConf, args: Array[String]
args match {
case ("--dir" | "-d") :: value :: tail =>
logDir = value
+ conf.set("spark.history.fs.logDirectory", value)
parse(tail)
case ("--help" | "-h") :: tail =>
@@ -42,9 +42,6 @@ private[spark] class HistoryServerArguments(conf: SparkConf, args: Array[String]
case _ =>
printUsageAndExit(1)
}
- if (logDir != null) {
- conf.set("spark.history.fs.logDirectory", logDir)
- }
}
private def printUsageAndExit(exitCode: Int) {
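
// Illustrative sketch (not part of the patch): the recursive list-matching style that
// HistoryServerArguments (and SparkSubmitArguments) use for option parsing, setting the
// configuration entry at the moment "--dir" is seen, as the hunk above now does.
object ArgParseSketch {
  def parse(args: List[String], settings: Map[String, String] = Map.empty): Map[String, String] =
    args match {
      case ("--dir" | "-d") :: value :: tail =>
        parse(tail, settings + ("spark.history.fs.logDirectory" -> value))
      case Nil =>
        settings
      case other :: _ =>
        sys.error(s"Unrecognized option: $other")
    }

  def main(args: Array[String]): Unit =
    println(parse(List("--dir", "/var/log/spark")))
}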
diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala
index a304102a49086..a70ecdb375373 100644
--- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala
@@ -17,6 +17,7 @@
package org.apache.spark.deploy.master
+import java.net.URLEncoder
import java.text.SimpleDateFormat
import java.util.Date
@@ -30,11 +31,11 @@ import akka.actor._
import akka.pattern.ask
import akka.remote.{DisassociatedEvent, RemotingLifecycleEvent}
import akka.serialization.SerializationExtension
-import org.apache.hadoop.fs.FileSystem
import org.apache.spark.{Logging, SecurityManager, SparkConf, SparkException}
import org.apache.spark.deploy.{ApplicationDescription, DriverDescription, ExecutorState}
import org.apache.spark.deploy.DeployMessages._
+import org.apache.spark.deploy.history.HistoryServer
import org.apache.spark.deploy.master.DriverState.DriverState
import org.apache.spark.deploy.master.MasterMessages._
import org.apache.spark.deploy.master.ui.MasterWebUI
@@ -57,6 +58,7 @@ private[spark] class Master(
def createDateFormat = new SimpleDateFormat("yyyyMMddHHmmss") // For application IDs
val WORKER_TIMEOUT = conf.getLong("spark.worker.timeout", 60) * 1000
val RETAINED_APPLICATIONS = conf.getInt("spark.deploy.retainedApplications", 200)
+ val RETAINED_DRIVERS = conf.getInt("spark.deploy.retainedDrivers", 200)
val REAPER_ITERATIONS = conf.getInt("spark.dead.worker.persistence", 15)
val RECOVERY_DIR = conf.get("spark.deploy.recoveryDirectory", "")
val RECOVERY_MODE = conf.get("spark.deploy.recoveryMode", "NONE")
@@ -72,9 +74,7 @@ private[spark] class Master(
val waitingApps = new ArrayBuffer[ApplicationInfo]
val completedApps = new ArrayBuffer[ApplicationInfo]
var nextAppNumber = 0
-
val appIdToUI = new HashMap[String, SparkUI]
- val fileSystemsUsed = new HashSet[FileSystem]
val drivers = new HashSet[DriverInfo]
val completedDrivers = new ArrayBuffer[DriverInfo]
@@ -154,12 +154,13 @@ private[spark] class Master(
}
override def postStop() {
+ masterMetricsSystem.report()
+ applicationMetricsSystem.report()
// prevent the CompleteRecovery message sending to restarted master
if (recoveryCompletionTask != null) {
recoveryCompletionTask.cancel()
}
webUi.stop()
- fileSystemsUsed.foreach(_.close())
masterMetricsSystem.stop()
applicationMetricsSystem.stop()
persistenceEngine.close()
@@ -644,10 +645,7 @@ private[spark] class Master(
waitingApps -= app
// If application events are logged, use them to rebuild the UI
- if (!rebuildSparkUI(app)) {
- // Avoid broken links if the UI is not reconstructed
- app.desc.appUiUrl = ""
- }
+ rebuildSparkUI(app)
for (exec <- app.executors.values) {
exec.worker.removeExecutor(exec)
@@ -669,29 +667,49 @@ private[spark] class Master(
*/
def rebuildSparkUI(app: ApplicationInfo): Boolean = {
val appName = app.desc.name
- val eventLogDir = app.desc.eventLogDir.getOrElse { return false }
+ val notFoundBasePath = HistoryServer.UI_PATH_PREFIX + "/not-found"
+ val eventLogDir = app.desc.eventLogDir.getOrElse {
+ // Event logging is not enabled for this application
+ app.desc.appUiUrl = notFoundBasePath
+ return false
+ }
val fileSystem = Utils.getHadoopFileSystem(eventLogDir)
val eventLogInfo = EventLoggingListener.parseLoggingInfo(eventLogDir, fileSystem)
val eventLogPaths = eventLogInfo.logPaths
val compressionCodec = eventLogInfo.compressionCodec
- if (!eventLogPaths.isEmpty) {
- try {
- val replayBus = new ReplayListenerBus(eventLogPaths, fileSystem, compressionCodec)
- val ui = new SparkUI(
- new SparkConf, replayBus, appName + " (completed)", "/history/" + app.id)
- replayBus.replay()
- app.desc.appUiUrl = ui.basePath
- appIdToUI(app.id) = ui
- webUi.attachSparkUI(ui)
- return true
- } catch {
- case e: Exception =>
- logError("Exception in replaying log for application %s (%s)".format(appName, app.id), e)
- }
- } else {
- logWarning("Application %s (%s) has no valid logs: %s".format(appName, app.id, eventLogDir))
+
+ if (eventLogPaths.isEmpty) {
+ // Event logging is enabled for this application, but no event logs are found
+ val title = s"Application history not found (${app.id})"
+ var msg = s"No event logs found for application $appName in $eventLogDir."
+ logWarning(msg)
+ msg += " Did you specify the correct logging directory?"
+ msg = URLEncoder.encode(msg, "UTF-8")
+ app.desc.appUiUrl = notFoundBasePath + s"?msg=$msg&title=$title"
+ return false
+ }
+
+ try {
+ val replayBus = new ReplayListenerBus(eventLogPaths, fileSystem, compressionCodec)
+ val ui = new SparkUI(new SparkConf, replayBus, appName + " (completed)",
+ HistoryServer.UI_PATH_PREFIX + s"/${app.id}")
+ replayBus.replay()
+ appIdToUI(app.id) = ui
+ webUi.attachSparkUI(ui)
+ // Application UI is successfully rebuilt, so link the Master UI to it
+ app.desc.appUiUrl = ui.basePath
+ true
+ } catch {
+ case e: Exception =>
+ // Relay exception message to application UI page
+ val title = s"Application history load error (${app.id})"
+ val exception = URLEncoder.encode(Utils.exceptionString(e), "UTF-8")
+ var msg = s"Exception in replaying log for application $appName!"
+ logError(msg, e)
+ msg = URLEncoder.encode(msg, "UTF-8")
+ app.desc.appUiUrl = notFoundBasePath + s"?msg=$msg&exception=$exception&title=$title"
+ false
}
- false
}
/** Generate a new app ID given an app's submission date */
@@ -744,11 +762,16 @@ private[spark] class Master(
case Some(driver) =>
logInfo(s"Removing driver: $driverId")
drivers -= driver
+ if (completedDrivers.size >= RETAINED_DRIVERS) {
+ val toRemove = math.max(RETAINED_DRIVERS / 10, 1)
+ completedDrivers.trimStart(toRemove)
+ }
completedDrivers += driver
persistenceEngine.removeDriver(driver)
driver.state = finalState
driver.exception = exception
driver.worker.foreach(w => w.removeDriver(driver))
+ schedule()
case None =>
logWarning(s"Asked to remove unknown driver: $driverId")
}
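
// Illustrative sketch (not part of the patch): the bounded-history pattern added above for
// completedDrivers (mirroring what the Master already does for completedApps). RETAINED stands
// in for spark.deploy.retainedDrivers.
import scala.collection.mutable.ArrayBuffer

object RetainedDriversSketch {
  val RETAINED = 200

  def record(completed: ArrayBuffer[String], driverId: String): Unit = {
    if (completed.size >= RETAINED) {
      // Trim the oldest ~10% in one batch instead of shifting one element per removal.
      val toRemove = math.max(RETAINED / 10, 1)
      completed.trimStart(toRemove)
    }
    completed += driverId
  }

  def main(args: Array[String]): Unit = {
    val completed = ArrayBuffer.empty[String]
    (1 to 205).foreach(i => record(completed, s"driver-$i"))
    println(completed.size)  // 185: the 201st entry triggers a trim of 20, then 4 more are added
  }
}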
diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala
index 34fa1429c86de..4588c130ef439 100644
--- a/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala
@@ -28,7 +28,7 @@ import org.json4s.JValue
import org.apache.spark.deploy.{ExecutorState, JsonProtocol}
import org.apache.spark.deploy.DeployMessages.{MasterStateResponse, RequestMasterState}
import org.apache.spark.deploy.master.ExecutorInfo
-import org.apache.spark.ui.{WebUIPage, UIUtils}
+import org.apache.spark.ui.{UIUtils, WebUIPage}
import org.apache.spark.util.Utils
private[spark] class ApplicationPage(parent: MasterWebUI) extends WebUIPage("app") {
diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/HistoryNotFoundPage.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/HistoryNotFoundPage.scala
new file mode 100644
index 0000000000000..d8daff3e7fb9c
--- /dev/null
+++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/HistoryNotFoundPage.scala
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.master.ui
+
+import java.net.URLDecoder
+import javax.servlet.http.HttpServletRequest
+
+import scala.xml.Node
+
+import org.apache.spark.ui.{UIUtils, WebUIPage}
+
+private[spark] class HistoryNotFoundPage(parent: MasterWebUI)
+ extends WebUIPage("history/not-found") {
+
+ /**
+ * Render a page that conveys failure in loading application history.
+ *
+ * This accepts 3 HTTP parameters:
+ * msg = message to display to the user
+ * title = title of the page
+ * exception = detailed description of the exception in loading application history (if any)
+ *
+ * Parameters "msg" and "exception" are assumed to be UTF-8 encoded.
+ */
+ def render(request: HttpServletRequest): Seq[Node] = {
+ val titleParam = request.getParameter("title")
+ val msgParam = request.getParameter("msg")
+ val exceptionParam = request.getParameter("exception")
+
+ // If no parameters are specified, assume the user did not enable event logging
+ val defaultTitle = "Event logging is not enabled"
+    val defaultContent =
+      <div class="row-fluid">
+        <div class="span12" style="font-size:14px">
+          No event logs were found for this application! To
+          <a href="http://spark.apache.org/docs/latest/monitoring.html">enable event logging</a>,
+          set <span style="font-style:italic">spark.eventLog.enabled</span> to true and
+          <span style="font-style:italic">spark.eventLog.dir</span> to the directory to which your
+          event logs are written.
+        </div>
+      </div>
+
+ val title = Option(titleParam).getOrElse(defaultTitle)
+ val content = Option(msgParam)
+ .map { msg => URLDecoder.decode(msg, "UTF-8") }
+ .map { msg =>
+        <div class="row-fluid">
+          <div class="span12" style="font-size:14px">{msg}</div>
+        </div> ++
+        Option(exceptionParam)
+          .map { e => URLDecoder.decode(e, "UTF-8") }
+          .map { e => <pre>{e}</pre> }
+ .getOrElse(Seq.empty)
+ }.getOrElse(defaultContent)
+
+ UIUtils.basicSparkPage(content, title)
+ }
+}
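
// Illustrative sketch (not part of the patch): the encode/decode round trip between
// Master.rebuildSparkUI (which URL-encodes msg/exception into the not-found link) and
// HistoryNotFoundPage (which decodes the request parameters before rendering).
import java.net.{URLDecoder, URLEncoder}

object NotFoundUrlSketch {
  def main(args: Array[String]): Unit = {
    val msg = "No event logs found for application MyApp in /tmp/spark-events. " +
      "Did you specify the correct logging directory?"
    val encoded = URLEncoder.encode(msg, "UTF-8")
    val url = s"/history/not-found?msg=$encoded&title=Application+history+not+found"
    println(url)
    // On the rendering side the parameter comes back out of the query string unchanged.
    println(URLDecoder.decode(encoded, "UTF-8") == msg)  // true
  }
}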
diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala
index a18b39fc95d64..16aa0493370dd 100644
--- a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala
@@ -21,7 +21,7 @@ import org.apache.spark.Logging
import org.apache.spark.deploy.master.Master
import org.apache.spark.ui.{SparkUI, WebUI}
import org.apache.spark.ui.JettyUtils._
-import org.apache.spark.util.{AkkaUtils, Utils}
+import org.apache.spark.util.AkkaUtils
/**
* Web UI server for the standalone master.
@@ -38,6 +38,7 @@ class MasterWebUI(val master: Master, requestedPort: Int)
/** Initialize all components of the server. */
def initialize() {
attachPage(new ApplicationPage(this))
+ attachPage(new HistoryNotFoundPage(this))
attachPage(new MasterPage(this))
attachHandler(createStaticHandler(MasterWebUI.STATIC_RESOURCE_DIR, "/static"))
master.masterMetricsSystem.getServletHandlers.foreach(attachHandler)
diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/CommandUtils.scala b/core/src/main/scala/org/apache/spark/deploy/worker/CommandUtils.scala
index 4af5bc3afad6c..687e492a0d6fc 100644
--- a/core/src/main/scala/org/apache/spark/deploy/worker/CommandUtils.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/worker/CommandUtils.scala
@@ -47,7 +47,6 @@ object CommandUtils extends Logging {
*/
def buildJavaOpts(command: Command, memory: Int, sparkHome: String): Seq[String] = {
val memoryOpts = Seq(s"-Xms${memory}M", s"-Xmx${memory}M")
- val extraOpts = command.extraJavaOptions.map(Utils.splitCommandString).getOrElse(Seq())
// Exists for backwards compatibility with older Spark versions
val workerLocalOpts = Option(getenv("SPARK_JAVA_OPTS")).map(Utils.splitCommandString)
@@ -62,7 +61,7 @@ object CommandUtils extends Logging {
val joined = command.libraryPathEntries.mkString(File.pathSeparator)
Seq(s"-Djava.library.path=$joined")
} else {
- Seq()
+ Seq()
}
val permGenOpt = Seq("-XX:MaxPermSize=128m")
@@ -71,11 +70,11 @@ object CommandUtils extends Logging {
val ext = if (System.getProperty("os.name").startsWith("Windows")) ".cmd" else ".sh"
val classPath = Utils.executeAndGetOutput(
Seq(sparkHome + "/bin/compute-classpath" + ext),
- extraEnvironment=command.environment)
+ extraEnvironment = command.environment)
val userClassPath = command.classPathEntries ++ Seq(classPath)
Seq("-cp", userClassPath.filterNot(_.isEmpty).mkString(File.pathSeparator)) ++
- permGenOpt ++ libraryOpts ++ extraOpts ++ workerLocalOpts ++ memoryOpts
+ permGenOpt ++ libraryOpts ++ workerLocalOpts ++ command.javaOpts ++ memoryOpts
}
/** Spawn a thread that will redirect a given stream to a file */
diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala b/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala
index 662d37871e7a6..5caaf6bea3575 100644
--- a/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala
@@ -36,6 +36,7 @@ import org.apache.spark.deploy.master.DriverState.DriverState
/**
* Manages the execution of one driver, including automatically restarting the driver on failure.
+ * This is currently only used in standalone cluster deploy mode.
*/
private[spark] class DriverRunner(
val driverId: String,
@@ -81,7 +82,7 @@ private[spark] class DriverRunner(
driverDesc.command.environment,
classPath,
driverDesc.command.libraryPathEntries,
- driverDesc.command.extraJavaOptions)
+ driverDesc.command.javaOpts)
val command = CommandUtils.buildCommandSeq(newCommand, driverDesc.mem,
sparkHome.getAbsolutePath)
launchDriver(command, driverDesc.command.environment, driverDir, driverDesc.supervise)
diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala b/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala
index 467317dd9b44c..7be89f9aff0f3 100644
--- a/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala
@@ -30,6 +30,7 @@ import org.apache.spark.util.logging.FileAppender
/**
* Manages the execution of one executor process.
+ * This is currently only used in standalone mode.
*/
private[spark] class ExecutorRunner(
val appId: String,
@@ -72,7 +73,7 @@ private[spark] class ExecutorRunner(
}
/**
- * kill executor process, wait for exit and notify worker to update resource status
+ * Kill executor process, wait for exit and notify worker to update resource status.
*
* @param message the exception message which caused the executor's death
*/
@@ -114,10 +115,13 @@ private[spark] class ExecutorRunner(
}
def getCommandSeq = {
- val command = Command(appDesc.command.mainClass,
- appDesc.command.arguments.map(substituteVariables) ++ Seq(appId), appDesc.command.environment,
- appDesc.command.classPathEntries, appDesc.command.libraryPathEntries,
- appDesc.command.extraJavaOptions)
+ val command = Command(
+ appDesc.command.mainClass,
+ appDesc.command.arguments.map(substituteVariables) ++ Seq(appId),
+ appDesc.command.environment,
+ appDesc.command.classPathEntries,
+ appDesc.command.libraryPathEntries,
+ appDesc.command.javaOpts)
CommandUtils.buildCommandSeq(command, memory, sparkHome.getAbsolutePath)
}
diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala
index ce425443051b0..458d9947bd873 100755
--- a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala
@@ -71,7 +71,7 @@ private[spark] class Worker(
// TTL for app folders/data; after TTL expires it will be cleaned up
val APP_DATA_RETENTION_SECS = conf.getLong("spark.worker.cleanup.appDataTtl", 7 * 24 * 3600)
-
+ val testing: Boolean = sys.props.contains("spark.testing")
val masterLock: Object = new Object()
var master: ActorSelection = null
var masterAddress: Address = null
@@ -81,7 +81,13 @@ private[spark] class Worker(
@volatile var registered = false
@volatile var connected = false
val workerId = generateWorkerId()
- val sparkHome = new File(Option(System.getenv("SPARK_HOME")).getOrElse("."))
+ val sparkHome =
+ if (testing) {
+ assert(sys.props.contains("spark.test.home"), "spark.test.home is not set!")
+ new File(sys.props("spark.test.home"))
+ } else {
+ new File(sys.env.get("SPARK_HOME").getOrElse("."))
+ }
var workDir: File = null
val executors = new HashMap[String, ExecutorRunner]
val finishedExecutors = new HashMap[String, ExecutorRunner]
@@ -233,9 +239,7 @@ private[spark] class Worker(
try {
logInfo("Asked to launch executor %s/%d for %s".format(appId, execId, appDesc.name))
val manager = new ExecutorRunner(appId, execId, appDesc, cores_, memory_,
- self, workerId, host,
- appDesc.sparkHome.map(userSparkHome => new File(userSparkHome)).getOrElse(sparkHome),
- workDir, akkaUrl, conf, ExecutorState.RUNNING)
+ self, workerId, host, sparkHome, workDir, akkaUrl, conf, ExecutorState.RUNNING)
executors(appId + "/" + execId) = manager
manager.start()
coresUsed += cores_
@@ -357,6 +361,7 @@ private[spark] class Worker(
}
override def postStop() {
+ metricsSystem.report()
registrationRetryTimer.foreach(_.cancel())
executors.values.foreach(_.kill())
drivers.values.foreach(_.kill())
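
// Illustrative sketch (not part of the patch): the spark-home selection added to Worker above,
// pulled out as a standalone function over sys.props / sys.env.
import java.io.File

object SparkHomeSketch {
  def resolveSparkHome(): File =
    if (sys.props.contains("spark.testing")) {
      // Tests must point spark.test.home at a valid Spark build.
      assert(sys.props.contains("spark.test.home"), "spark.test.home is not set!")
      new File(sys.props("spark.test.home"))
    } else {
      new File(sys.env.getOrElse("SPARK_HOME", "."))
    }

  def main(args: Array[String]): Unit =
    println(resolveSparkHome().getAbsolutePath)
}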
diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/ui/LogPage.scala b/core/src/main/scala/org/apache/spark/deploy/worker/ui/LogPage.scala
index b389cb546de6c..ecb358c399819 100644
--- a/core/src/main/scala/org/apache/spark/deploy/worker/ui/LogPage.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/worker/ui/LogPage.scala
@@ -17,7 +17,6 @@
package org.apache.spark.deploy.worker.ui
-import java.io.File
import javax.servlet.http.HttpServletRequest
import scala.xml.Node
@@ -25,7 +24,7 @@ import scala.xml.Node
import org.apache.spark.ui.{WebUIPage, UIUtils}
import org.apache.spark.util.Utils
import org.apache.spark.Logging
-import org.apache.spark.util.logging.{FileAppender, RollingFileAppender}
+import org.apache.spark.util.logging.RollingFileAppender
private[spark] class LogPage(parent: WorkerWebUI) extends WebUIPage("logPage") with Logging {
private val worker = parent.worker
@@ -64,11 +63,11 @@ private[spark] class LogPage(parent: WorkerWebUI) extends WebUIPage("logPage") w
val offset = Option(request.getParameter("offset")).map(_.toLong)
val byteLength = Option(request.getParameter("byteLength")).map(_.toInt).getOrElse(defaultBytes)
- val (logDir, params) = (appId, executorId, driverId) match {
+ val (logDir, params, pageName) = (appId, executorId, driverId) match {
case (Some(a), Some(e), None) =>
- (s"${workDir.getPath}/$a/$e/", s"appId=$a&executorId=$e")
+ (s"${workDir.getPath}/$a/$e/", s"appId=$a&executorId=$e", s"$a/$e")
case (None, None, Some(d)) =>
- (s"${workDir.getPath}/$d/", s"driverId=$d")
+ (s"${workDir.getPath}/$d/", s"driverId=$d", d)
case _ =>
throw new Exception("Request must specify either application or driver identifiers")
}
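
// Illustrative sketch (not part of the patch): the three-way match on (appId, executorId,
// driverId) that LogPage now uses to derive the log directory, query string, and page name in
// one step. workDir is an assumed stand-in for the worker's work directory path.
object LogPathSketch {
  def resolve(
      workDir: String,
      appId: Option[String],
      executorId: Option[String],
      driverId: Option[String]): (String, String, String) =
    (appId, executorId, driverId) match {
      case (Some(a), Some(e), None) => (s"$workDir/$a/$e/", s"appId=$a&executorId=$e", s"$a/$e")
      case (None, None, Some(d))    => (s"$workDir/$d/", s"driverId=$d", d)
      case _ =>
        throw new Exception("Request must specify either application or driver identifiers")
    }

  def main(args: Array[String]): Unit = {
    println(resolve("/work", Some("app-20140707"), Some("0"), None))
    println(resolve("/work", None, None, Some("driver-20140707")))
  }
}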
@@ -120,7 +119,7 @@ private[spark] class LogPage(parent: WorkerWebUI) extends WebUIPage("logPage") w