[SPARK-10239] [SPARK-10244] [MLLIB] update since versions in mllib.pmml and mllib.util

Same as #8421 but for `mllib.pmml` and `mllib.util`.

cc dbtsai

Author: Xiangrui Meng <[email protected]>

Closes #8430 from mengxr/SPARK-10239 and squashes the following commits:

a189acf [Xiangrui Meng] update since versions in mllib.pmml and mllib.util
mengxr authored and DB Tsai committed Aug 25, 2015
1 parent 9205907 commit 00ae4be
Showing 9 changed files with 41 additions and 11 deletions.
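
For context, the `@Since` annotation records the Spark version in which a public API first shipped and is surfaced in the generated API docs. A minimal sketch of the pattern the diff applies throughout; the object and member names here are hypothetical, not part of the commit:

```scala
import org.apache.spark.annotation.{DeveloperApi, Since}

/**
 * :: DeveloperApi ::
 * Hypothetical example: `@Since` goes on the type and on each public member.
 */
@DeveloperApi
@Since("1.4.0")
object ExampleUtil {

  /** A member added in a later release carries its own, later version. */
  @Since("1.5.0")
  def exampleMethod(): Unit = {}
}
```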
mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala
@@ -23,7 +23,7 @@ import javax.xml.transform.stream.StreamResult
import org.jpmml.model.JAXBUtil

import org.apache.spark.SparkContext
-import org.apache.spark.annotation.{DeveloperApi, Experimental}
+import org.apache.spark.annotation.{DeveloperApi, Experimental, Since}
import org.apache.spark.mllib.pmml.export.PMMLModelExportFactory

/**
@@ -33,6 +33,7 @@ import org.apache.spark.mllib.pmml.export.PMMLModelExportFactory
* developed by the Data Mining Group (www.dmg.org).
*/
@DeveloperApi
@Since("1.4.0")
trait PMMLExportable {

/**
@@ -48,6 +49,7 @@ trait PMMLExportable {
* Export the model to a local file in PMML format
*/
@Experimental
@Since("1.4.0")
def toPMML(localPath: String): Unit = {
toPMML(new StreamResult(new File(localPath)))
}
@@ -57,6 +59,7 @@ trait PMMLExportable {
* Export the model to a directory on a distributed file system in PMML format
*/
@Experimental
@Since("1.4.0")
def toPMML(sc: SparkContext, path: String): Unit = {
val pmml = toPMML()
sc.parallelize(Array(pmml), 1).saveAsTextFile(path)
@@ -67,6 +70,7 @@ trait PMMLExportable {
* Export the model to the OutputStream in PMML format
*/
@Experimental
@Since("1.4.0")
def toPMML(outputStream: OutputStream): Unit = {
toPMML(new StreamResult(outputStream))
}
@@ -76,6 +80,7 @@ trait PMMLExportable {
* Export the model to a String in PMML format
*/
@Experimental
@Since("1.4.0")
def toPMML(): String = {
val writer = new StringWriter
toPMML(new StreamResult(writer))
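
As a usage note (not part of this diff): models that mix in `PMMLExportable`, such as `KMeansModel`, gain all four export targets above. A minimal sketch, assuming a live `SparkContext` named `sc` and illustrative output paths:

```scala
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

// Train a small model; KMeansModel mixes in PMMLExportable.
val data = sc.parallelize(Seq(
  Vectors.dense(0.0, 0.0), Vectors.dense(1.0, 1.0),
  Vectors.dense(9.0, 8.0), Vectors.dense(8.0, 9.0)))
val model = KMeans.train(data, 2, 10) // k = 2, maxIterations = 10

model.toPMML("/tmp/kmeans.xml")      // local file
model.toPMML(sc, "/tmp/kmeans-pmml") // distributed file system
val pmml: String = model.toPMML()    // in-memory String
```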
mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala
@@ -17,23 +17,25 @@

package org.apache.spark.mllib.util

-import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.Logging
-import org.apache.spark.rdd.RDD
+import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD

/**
* :: DeveloperApi ::
* A collection of methods used to validate data before applying ML algorithms.
*/
@DeveloperApi
@Since("0.8.0")
object DataValidators extends Logging {

/**
* Function to check if labels used for classification are either zero or one.
*
* @return True if labels are all zero or one, false otherwise.
*/
@Since("1.0.0")
val binaryLabelValidator: RDD[LabeledPoint] => Boolean = { data =>
val numInvalid = data.filter(x => x.label != 1.0 && x.label != 0.0).count()
if (numInvalid != 0) {
@@ -48,6 +50,7 @@ object DataValidators extends Logging {
*
* @return True if labels are all in the range of {0, 1, ..., k-1}, false otherwise.
*/
@Since("1.3.0")
def multiLabelValidator(k: Int): RDD[LabeledPoint] => Boolean = { data =>
val numInvalid = data.filter(x =>
x.label - x.label.toInt != 0.0 || x.label < 0 || x.label > k - 1).count()
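
A usage sketch for the two validators (not part of the diff), assuming a live `SparkContext` named `sc`:

```scala
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.DataValidators

val points = sc.parallelize(Seq(
  LabeledPoint(0.0, Vectors.dense(1.0, 2.0)),
  LabeledPoint(1.0, Vectors.dense(3.0, 4.0))))

// true: every label is 0.0 or 1.0
val binaryOk = DataValidators.binaryLabelValidator(points)

// true: every label falls in {0, 1, 2}
val multiOk = DataValidators.multiLabelValidator(3)(points)
```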
mllib/src/main/scala/org/apache/spark/mllib/util/KMeansDataGenerator.scala
@@ -19,8 +19,8 @@ package org.apache.spark.mllib.util

import scala.util.Random

-import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.SparkContext
+import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.rdd.RDD

/**
@@ -30,6 +30,7 @@ import org.apache.spark.rdd.RDD
* cluster with scale 1 around each center.
*/
@DeveloperApi
@Since("0.8.0")
object KMeansDataGenerator {

/**
@@ -42,6 +43,7 @@ object KMeansDataGenerator {
* @param r Scaling factor for the distribution of the initial centers
* @param numPartitions Number of partitions of the generated RDD; default 2
*/
@Since("0.8.0")
def generateKMeansRDD(
sc: SparkContext,
numPoints: Int,
@@ -62,6 +64,7 @@ object KMeansDataGenerator {
}
}

@Since("0.8.0")
def main(args: Array[String]) {
if (args.length < 6) {
// scalastyle:off println
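
A usage sketch (not part of the diff), following the parameter list in the scaladoc above and assuming a live `SparkContext` named `sc`:

```scala
import org.apache.spark.mllib.util.KMeansDataGenerator

// 1000 points around 5 centers in 3 dimensions; the initial centers are
// drawn with scaling factor 10.0, and the RDD gets 2 partitions.
val points = KMeansDataGenerator.generateKMeansRDD(sc, 1000, 5, 3, 10.0, 2)
points.take(3).foreach(p => println(p.mkString(", ")))
```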
mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala
@@ -22,11 +22,11 @@

import com.github.fommil.netlib.BLAS.{getInstance => blas}

-import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.SparkContext
-import org.apache.spark.rdd.RDD
+import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD

/**
* :: DeveloperApi ::
@@ -35,6 +35,7 @@ import org.apache.spark.mllib.regression.LabeledPoint
* response variable `Y`.
*/
@DeveloperApi
@Since("0.8.0")
object LinearDataGenerator {

/**
@@ -46,6 +47,7 @@ object LinearDataGenerator {
* @param seed Random seed
* @return Java List of input.
*/
@Since("0.8.0")
def generateLinearInputAsList(
intercept: Double,
weights: Array[Double],
@@ -68,6 +70,7 @@ object LinearDataGenerator {
* @param eps Epsilon scaling factor.
* @return Seq of input.
*/
@Since("0.8.0")
def generateLinearInput(
intercept: Double,
weights: Array[Double],
@@ -92,6 +95,7 @@ object LinearDataGenerator {
* @param eps Epsilon scaling factor.
* @return Seq of input.
*/
@Since("0.8.0")
def generateLinearInput(
intercept: Double,
weights: Array[Double],
@@ -132,6 +136,7 @@ object LinearDataGenerator {
*
* @return RDD of LabeledPoint containing sample data.
*/
@Since("0.8.0")
def generateLinearRDD(
sc: SparkContext,
nexamples: Int,
@@ -151,6 +156,7 @@ object LinearDataGenerator {
data
}

@Since("0.8.0")
def main(args: Array[String]) {
if (args.length < 2) {
// scalastyle:off println
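
A usage sketch (not part of the diff), assuming a live `SparkContext` named `sc`:

```scala
import org.apache.spark.mllib.util.LinearDataGenerator

// 10000 LabeledPoints with 10 features, noise scaling eps = 0.1,
// spread over 2 partitions.
val examples = LinearDataGenerator.generateLinearRDD(sc, 10000, 10, 0.1, 2)
examples.take(2).foreach(println)
```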
mllib/src/main/scala/org/apache/spark/mllib/util/LogisticRegressionDataGenerator.scala
@@ -19,7 +19,7 @@ package org.apache.spark.mllib.util

import scala.util.Random

-import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.annotation.{Since, DeveloperApi}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.regression.LabeledPoint
@@ -31,6 +31,7 @@ import org.apache.spark.mllib.linalg.Vectors
* with probability `probOne` and scales features for positive examples by `eps`.
*/
@DeveloperApi
@Since("0.8.0")
object LogisticRegressionDataGenerator {

/**
@@ -43,6 +44,7 @@ object LogisticRegressionDataGenerator {
* @param nparts Number of partitions of the generated RDD. Default value is 2.
* @param probOne Probability that a label is 1 (and not 0). Default value is 0.5.
*/
@Since("0.8.0")
def generateLogisticRDD(
sc: SparkContext,
nexamples: Int,
@@ -62,6 +64,7 @@ object LogisticRegressionDataGenerator {
data
}

@Since("0.8.0")
def main(args: Array[String]) {
if (args.length != 5) {
// scalastyle:off println
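
A usage sketch (not part of the diff), matching the scaladoc above and assuming a live `SparkContext` named `sc`:

```scala
import org.apache.spark.mllib.util.LogisticRegressionDataGenerator

// 10000 examples with 5 features; positive examples scaled by eps = 3.0;
// 2 partitions; each label is 1 with probability 0.5.
val lrData =
  LogisticRegressionDataGenerator.generateLogisticRDD(sc, 10000, 5, 3.0, 2, 0.5)
```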
mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala
@@ -23,7 +23,7 @@ import scala.language.postfixOps
import scala.util.Random

import org.apache.spark.SparkContext
-import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.annotation.{Since, DeveloperApi}
import org.apache.spark.mllib.linalg.{BLAS, DenseMatrix}
import org.apache.spark.rdd.RDD

@@ -52,7 +52,9 @@ import org.apache.spark.rdd.RDD
* testSampFact (Double) Percentage of training data to use as test data.
*/
@DeveloperApi
@Since("0.8.0")
object MFDataGenerator {
@Since("0.8.0")
def main(args: Array[String]) {
if (args.length < 2) {
// scalastyle:off println
mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
@@ -36,6 +36,7 @@ import org.apache.spark.streaming.dstream.DStream
/**
* Helper methods to load, save and pre-process data used in ML Lib.
*/
@Since("0.8.0")
object MLUtils {

private[mllib] lazy val EPSILON = {
@@ -168,6 +169,7 @@ object MLUtils {
*
* @see [[org.apache.spark.mllib.util.MLUtils#loadLibSVMFile]]
*/
@Since("1.0.0")
def saveAsLibSVMFile(data: RDD[LabeledPoint], dir: String) {
// TODO: allow to specify label precision and feature precision.
val dataStr = data.map { case LabeledPoint(label, features) =>
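
A usage sketch for the annotated `MLUtils` methods (not part of the diff), assuming a live `SparkContext` named `sc` and an existing `RDD[LabeledPoint]` named `examples`:

```scala
import org.apache.spark.mllib.util.MLUtils

// Round-trip a dataset through the LIBSVM text format; the output
// directory must not already exist.
MLUtils.saveAsLibSVMFile(examples, "/tmp/sample-libsvm")
val reloaded = MLUtils.loadLibSVMFile(sc, "/tmp/sample-libsvm")
```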
mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala
@@ -21,20 +21,22 @@ import scala.util.Random

import com.github.fommil.netlib.BLAS.{getInstance => blas}

-import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.SparkContext
-import org.apache.spark.rdd.RDD
+import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD

/**
* :: DeveloperApi ::
* Generate sample data used for SVM. This class generates uniform random values
* for the features and adds Gaussian noise with weight 0.1 to generate labels.
*/
@DeveloperApi
@Since("0.8.0")
object SVMDataGenerator {

@Since("0.8.0")
def main(args: Array[String]) {
if (args.length < 2) {
// scalastyle:off println
mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala
@@ -24,7 +24,7 @@ import org.json4s._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.SparkContext
-import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.sql.catalyst.ScalaReflection
import org.apache.spark.sql.types.{DataType, StructField, StructType}

@@ -35,6 +35,7 @@ import org.apache.spark.sql.types.{DataType, StructField, StructType}
* This should be inherited by the class which implements model instances.
*/
@DeveloperApi
@Since("1.3.0")
trait Saveable {

/**
@@ -50,6 +51,7 @@ trait Saveable {
* @param path Path specifying the directory in which to save this model.
* If the directory already exists, this method throws an exception.
*/
@Since("1.3.0")
def save(sc: SparkContext, path: String): Unit

/** Current version of model save/load format. */
@@ -64,6 +66,7 @@ trait Saveable {
* This should be inherited by an object paired with the model class.
*/
@DeveloperApi
@Since("1.3.0")
trait Loader[M <: Saveable] {

/**
@@ -75,6 +78,7 @@ trait Loader[M &lt;: Saveable] {
* @param path Path specifying the directory to which the model was saved.
* @return Model instance
*/
@Since("1.3.0")
def load(sc: SparkContext, path: String): M

}
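
A usage sketch of the `Saveable`/`Loader` pair (not part of the diff): concrete models implement `save`, and their companion objects implement `load`. Assumes a live `SparkContext` named `sc` and an existing `RDD[LabeledPoint]` named `trainingData`:

```scala
import org.apache.spark.mllib.classification.{LogisticRegressionModel, LogisticRegressionWithSGD}

// Saveable: persist the model (throws if the directory already exists).
val model = LogisticRegressionWithSGD.train(trainingData, 100)
model.save(sc, "/tmp/lr-model")

// Loader: the companion object restores an equivalent model.
val sameModel = LogisticRegressionModel.load(sc, "/tmp/lr-model")
```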
