Allow nextBatchStream to be called after we're done looking at all streams

Previously, this could throw an ArrayIndexOutOfBoundsException after we read the
very last element.
mateiz committed Jul 30, 2014
1 parent a34b352 commit fa2e8db
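
For context, here is a minimal, self-contained sketch of the failure mode the commit message describes (the object name and batch sizes are hypothetical, not Spark's actual values): each call bumps a counter and then indexes an array of recorded batch sizes, so one call past the final batch indexes beyond the end of the array.

    // Scala sketch of the pre-fix overread; all values here are made up.
    object OverreadDemo {
      def main(args: Array[String]): Unit = {
        val serializerBatchSizes = Array(100L, 100L, 40L) // three recorded batches
        var batchStreamsRead = 0

        // Mirrors the old logic: increment first, then index with no bounds check.
        def nextBatchSize(): Long = {
          batchStreamsRead += 1
          serializerBatchSizes(batchStreamsRead - 1)
        }

        serializerBatchSizes.indices.foreach(_ => nextBatchSize()) // indices 0..2: fine
        nextBatchSize() // 4th call indexes 3: ArrayIndexOutOfBoundsException
      }
    }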
Showing 2 changed files with 9 additions and 3 deletions.
3 changes: 2 additions & 1 deletion core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala
@@ -17,10 +17,11 @@

 package org.apache.spark.rdd
 
+import scala.language.existentials
+
 import java.io.{IOException, ObjectOutputStream}
 
 import scala.collection.mutable.ArrayBuffer
-import scala.language.existentials
 
 import org.apache.spark.{InterruptibleIterator, Partition, Partitioner, SparkEnv, TaskContext}
 import org.apache.spark.{Dependency, OneToOneDependency, ShuffleDependency}
9 changes: 7 additions & 2 deletions core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala

@@ -501,8 +501,13 @@ private[spark] class ExternalSorter[K, V, C](

 /** Construct a stream that only reads from the next batch */
 def nextBatchStream(): InputStream = {
-  batchStreamsRead += 1
-  ByteStreams.limit(bufferedStream, spill.serializerBatchSizes(batchStreamsRead - 1))
+  if (batchStreamsRead < spill.serializerBatchSizes.length) {
+    batchStreamsRead += 1
+    ByteStreams.limit(bufferedStream, spill.serializerBatchSizes(batchStreamsRead - 1))
+  } else {
+    // No more batches left; give an empty stream
+    bufferedStream
+  }
 }
 
 /**
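
The guarded version returns bufferedStream once every batch has been handed out; by that point the shared underlying stream has presumably been read to the end, so callers see immediate EOF rather than an exception. Below is a runnable sketch of the same pattern (the class, data, and batch sizes are invented for illustration; ByteStreams.limit is Guava's real API, the same call the patch uses):

    import java.io.{ByteArrayInputStream, InputStream}

    import com.google.common.io.ByteStreams

    // Sketch of the fixed guard; byte data and batch sizes are made up.
    object GuardedBatchStreams {
      private val bufferedStream: InputStream =
        new ByteArrayInputStream(Array.fill[Byte](12)(1)) // 12 bytes = 5 + 4 + 3
      private val serializerBatchSizes = Array(5L, 4L, 3L)
      private var batchStreamsRead = 0

      def nextBatchStream(): InputStream = {
        if (batchStreamsRead < serializerBatchSizes.length) {
          batchStreamsRead += 1
          ByteStreams.limit(bufferedStream, serializerBatchSizes(batchStreamsRead - 1))
        } else {
          // Past the last batch the shared stream is already exhausted, so it
          // acts as an empty stream instead of indexing past the array's end.
          bufferedStream
        }
      }

      def main(args: Array[String]): Unit = {
        // One call per batch plus one extra; before the fix, the extra call threw.
        (0 to serializerBatchSizes.length).foreach { _ =>
          println(ByteStreams.toByteArray(nextBatchStream()).length) // 5, 4, 3, 0
        }
      }
    }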
