
Moved streaming loader to MLUtils
freeman-lab committed Aug 1, 2014
1 parent b9b69f6 commit 7d51378
Showing 2 changed files with 13 additions and 42 deletions.

The other changed file, which previously held the streaming loader, was deleted (42 deletions).

13 changes: 13 additions & 0 deletions mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
@@ -30,6 +30,8 @@ import org.apache.spark.util.random.BernoulliSampler
import org.apache.spark.mllib.regression.{LabeledPointParser, LabeledPoint}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream

/**
* Helper methods to load, save and pre-process data used in ML Lib.
@@ -212,6 +214,17 @@ object MLUtils {
def loadLabeledPoints(sc: SparkContext, dir: String): RDD[LabeledPoint] =
loadLabeledPoints(sc, dir, sc.defaultMinPartitions)

/**
* Loads streaming labeled points from a stream of text files
* where points are in the same format as used in `RDD[LabeledPoint].saveAsTextFile`.
*
* @param ssc Streaming context
* @param dir Directory path in any Hadoop-supported file system URI
* @return Labeled points stored as a DStream[LabeledPoint]
*/
def loadStreamingLabeledPoints(ssc: StreamingContext, dir: String): DStream[LabeledPoint] =
ssc.textFileStream(dir).map(LabeledPointParser.parse)

/**
* Load labeled data from a file. The data format used here is
* <L>, <f1> <f2> ...
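For context, a minimal sketch of how the new loader might be driven from a streaming application. Only MLUtils.loadStreamingLabeledPoints comes from the diff above; the application name, one-second batch interval, and monitored directory are illustrative assumptions, not part of this commit.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.mllib.util.MLUtils

object StreamingLoaderExample {
  def main(args: Array[String]): Unit = {
    // Hypothetical directory; files dropped here are picked up on each batch.
    val trainingDir = "hdfs:///data/streaming-labeled-points"

    val conf = new SparkConf().setAppName("StreamingLoaderExample")
    val ssc = new StreamingContext(conf, Seconds(1))

    // Each line is expected in LabeledPoint text format, e.g. "(1.0,[0.5,0.2])",
    // i.e. the format produced by RDD[LabeledPoint].saveAsTextFile.
    val labeledPoints = MLUtils.loadStreamingLabeledPoints(ssc, trainingDir)

    // Print a few parsed points per batch as a sanity check.
    labeledPoints.print()

    ssc.start()
    ssc.awaitTermination()
  }
}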
