
Moved streaming loader to MLUtils
freeman-lab committed Aug 1, 2014
1 parent b9b69f6 commit 7d51378
Showing 2 changed files with 13 additions and 42 deletions.

The other changed file, which previously held the streaming loader, was deleted (42 deletions).

13 changes: 13 additions & 0 deletions mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
@@ -30,6 +30,8 @@ import org.apache.spark.util.random.BernoulliSampler
import org.apache.spark.mllib.regression.{LabeledPointParser, LabeledPoint}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream

/**
* Helper methods to load, save and pre-process data used in ML Lib.
@@ -212,6 +214,17 @@ object MLUtils {
def loadLabeledPoints(sc: SparkContext, dir: String): RDD[LabeledPoint] =
loadLabeledPoints(sc, dir, sc.defaultMinPartitions)

/**
* Loads streaming labeled points from a stream of text files
* where points are in the same format as used in `RDD[LabeledPoint].saveAsTextFile`.
*
* @param ssc Streaming context
* @param dir Directory path in any Hadoop-supported file system URI
* @return Labeled points stored as a DStream[LabeledPoint]
*/
def loadStreamingLabeledPoints(ssc: StreamingContext, dir: String): DStream[LabeledPoint] =
ssc.textFileStream(dir).map(LabeledPointParser.parse)

/**
* Load labeled data from a file. The data format used here is
* <L>, <f1> <f2> ...
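For context, a minimal sketch of how the new loader might be driven from a streaming application. Only MLUtils.loadStreamingLabeledPoints comes from the diff above; the application name, one-second batch interval, and monitored directory are illustrative assumptions, not part of this commit.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.mllib.util.MLUtils

object StreamingLoaderExample {
  def main(args: Array[String]): Unit = {
    // Hypothetical directory; files dropped here are picked up on each batch.
    val trainingDir = "hdfs:///data/streaming-labeled-points"

    val conf = new SparkConf().setAppName("StreamingLoaderExample")
    val ssc = new StreamingContext(conf, Seconds(1))

    // Each line is expected in LabeledPoint text format, e.g. "(1.0,[0.5,0.2])",
    // i.e. the format produced by RDD[LabeledPoint].saveAsTextFile.
    val labeledPoints = MLUtils.loadStreamingLabeledPoints(ssc, trainingDir)

    // Print a few parsed points per batch as a sanity check.
    labeledPoints.print()

    ssc.start()
    ssc.awaitTermination()
  }
}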
