Skip to content

Commit

Permalink
[SPARK-7752] [MLLIB] Use lowercase letters for NaiveBayes.modelType
Browse files Browse the repository at this point in the history
to be consistent with other string names in MLlib. This PR also updates the implementation to use vals instead of hardcoded strings. jkbradley leahmcguire

Author: Xiangrui Meng <[email protected]>

Closes apache#6277 from mengxr/SPARK-7752 and squashes the following commits:

f38b662 [Xiangrui Meng] add another case _ back in test
ae5c66a [Xiangrui Meng] model type -> modelType
711d1c6 [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into SPARK-7752
40ae53e [Xiangrui Meng] fix Java test suite
264a814 [Xiangrui Meng] add case _ back
3c456a8 [Xiangrui Meng] update NB user guide
17bba53 [Xiangrui Meng] update naive Bayes to use lowercase model type strings
  • Loading branch information
mengxr authored and jeanlyn committed Jun 12, 2015
1 parent bcb56ef commit 737fa8d
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 59 deletions.
9 changes: 5 additions & 4 deletions docs/mllib-naive-bayes.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ Within that context, each observation is a document and each
feature represents a term whose value is the frequency of the term (in multinomial naive Bayes) or
a zero or one indicating whether the term was found in the document (in Bernoulli naive Bayes).
Feature values must be nonnegative. The model type is selected with an optional parameter
"Multinomial" or "Bernoulli" with "Multinomial" as the default.
"multinomial" or "bernoulli" with "multinomial" as the default.
[Additive smoothing](http://en.wikipedia.org/wiki/Lidstone_smoothing) can be used by
setting the parameter $\lambda$ (default to $1.0$). For document classification, the input feature
vectors are usually sparse, and sparse vectors should be supplied as input to take advantage of
Expand All @@ -35,7 +35,7 @@ sparsity. Since the training data is only used once, it is not necessary to cach
[NaiveBayes](api/scala/index.html#org.apache.spark.mllib.classification.NaiveBayes$) implements
multinomial naive Bayes. It takes an RDD of
[LabeledPoint](api/scala/index.html#org.apache.spark.mllib.regression.LabeledPoint) and an optional
smoothing parameter `lambda` as input, an optional model type parameter (default is Multinomial), and outputs a
smoothing parameter `lambda` as input, an optional model type parameter (default is "multinomial"), and outputs a
[NaiveBayesModel](api/scala/index.html#org.apache.spark.mllib.classification.NaiveBayesModel), which
can be used for evaluation and prediction.

Expand All @@ -54,7 +54,7 @@ val splits = parsedData.randomSplit(Array(0.6, 0.4), seed = 11L)
val training = splits(0)
val test = splits(1)

val model = NaiveBayes.train(training, lambda = 1.0, model = "Multinomial")
val model = NaiveBayes.train(training, lambda = 1.0, model = "multinomial")

val predictionAndLabel = test.map(p => (model.predict(p.features), p.label))
val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()
Expand All @@ -75,14 +75,15 @@ optionally smoothing parameter `lambda` as input, and output a
can be used for evaluation and prediction.

{% highlight java %}
import scala.Tuple2;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.mllib.classification.NaiveBayes;
import org.apache.spark.mllib.classification.NaiveBayesModel;
import org.apache.spark.mllib.regression.LabeledPoint;
import scala.Tuple2;

JavaRDD<LabeledPoint> training = ... // training set
JavaRDD<LabeledPoint> test = ... // test set
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,21 +25,20 @@ import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.{Logging, SparkContext, SparkException}
import org.apache.spark.mllib.linalg.{BLAS, DenseMatrix, DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.linalg.{BLAS, DenseMatrix, DenseVector, SparseVector, Vector}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.{Loader, Saveable}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SQLContext}


/**
* Model for Naive Bayes Classifiers.
*
* @param labels list of labels
* @param pi log of class priors, whose dimension is C, number of labels
* @param theta log of class conditional probabilities, whose dimension is C-by-D,
* where D is number of features
* @param modelType The type of NB model to fit can be "Multinomial" or "Bernoulli"
* @param modelType The type of NB model to fit can be "multinomial" or "bernoulli"
*/
class NaiveBayesModel private[mllib] (
val labels: Array[Double],
Expand All @@ -48,11 +47,13 @@ class NaiveBayesModel private[mllib] (
val modelType: String)
extends ClassificationModel with Serializable with Saveable {

import NaiveBayes.{Bernoulli, Multinomial, supportedModelTypes}

private val piVector = new DenseVector(pi)
private val thetaMatrix = new DenseMatrix(labels.size, theta(0).size, theta.flatten, true)
private val thetaMatrix = new DenseMatrix(labels.length, theta(0).length, theta.flatten, true)

private[mllib] def this(labels: Array[Double], pi: Array[Double], theta: Array[Array[Double]]) =
this(labels, pi, theta, "Multinomial")
this(labels, pi, theta, NaiveBayes.Multinomial)

/** A Java-friendly constructor that takes three Iterable parameters. */
private[mllib] def this(
Expand All @@ -61,12 +62,15 @@ class NaiveBayesModel private[mllib] (
theta: JIterable[JIterable[Double]]) =
this(labels.asScala.toArray, pi.asScala.toArray, theta.asScala.toArray.map(_.asScala.toArray))

require(supportedModelTypes.contains(modelType),
s"Invalid modelType $modelType. Supported modelTypes are $supportedModelTypes.")

// Bernoulli scoring requires log(condprob) if 1, log(1-condprob) if 0.
// This precomputes log(1.0 - exp(theta)) and its sum which are used for the linear algebra
// application of this condition (in predict function).
private val (thetaMinusNegTheta, negThetaSum) = modelType match {
case "Multinomial" => (None, None)
case "Bernoulli" =>
case Multinomial => (None, None)
case Bernoulli =>
val negTheta = thetaMatrix.map(value => math.log(1.0 - math.exp(value)))
val ones = new DenseVector(Array.fill(thetaMatrix.numCols){1.0})
val thetaMinusNegTheta = thetaMatrix.map { value =>
Expand All @@ -75,7 +79,7 @@ class NaiveBayesModel private[mllib] (
(Option(thetaMinusNegTheta), Option(negTheta.multiply(ones)))
case _ =>
// This should never happen.
throw new UnknownError(s"NaiveBayesModel was created with an unknown ModelType: $modelType")
throw new UnknownError(s"Invalid modelType: $modelType.")
}

override def predict(testData: RDD[Vector]): RDD[Double] = {
Expand All @@ -88,15 +92,15 @@ class NaiveBayesModel private[mllib] (

override def predict(testData: Vector): Double = {
modelType match {
case "Multinomial" =>
case Multinomial =>
val prob = thetaMatrix.multiply(testData)
BLAS.axpy(1.0, piVector, prob)
labels(prob.argmax)
case "Bernoulli" =>
case Bernoulli =>
testData.foreachActive { (index, value) =>
if (value != 0.0 && value != 1.0) {
throw new SparkException(
s"Bernoulli Naive Bayes requires 0 or 1 feature values but found $testData.")
s"Bernoulli naive Bayes requires 0 or 1 feature values but found $testData.")
}
}
val prob = thetaMinusNegTheta.get.multiply(testData)
Expand All @@ -105,7 +109,7 @@ class NaiveBayesModel private[mllib] (
labels(prob.argmax)
case _ =>
// This should never happen.
throw new UnknownError(s"NaiveBayesModel was created with an unknown ModelType: $modelType")
throw new UnknownError(s"Invalid modelType: $modelType.")
}
}

Expand Down Expand Up @@ -230,16 +234,16 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] {
s"($loadedClassName, $version). Supported:\n" +
s" ($classNameV1_0, 1.0)")
}
assert(model.pi.size == numClasses,
assert(model.pi.length == numClasses,
s"NaiveBayesModel.load expected $numClasses classes," +
s" but class priors vector pi had ${model.pi.size} elements")
assert(model.theta.size == numClasses,
s" but class priors vector pi had ${model.pi.length} elements")
assert(model.theta.length == numClasses,
s"NaiveBayesModel.load expected $numClasses classes," +
s" but class conditionals array theta had ${model.theta.size} elements")
assert(model.theta.forall(_.size == numFeatures),
s" but class conditionals array theta had ${model.theta.length} elements")
assert(model.theta.forall(_.length == numFeatures),
s"NaiveBayesModel.load expected $numFeatures features," +
s" but class conditionals array theta had elements of size:" +
s" ${model.theta.map(_.size).mkString(",")}")
s" ${model.theta.map(_.length).mkString(",")}")
model
}
}
Expand All @@ -257,9 +261,11 @@ class NaiveBayes private (
private var lambda: Double,
private var modelType: String) extends Serializable with Logging {

def this(lambda: Double) = this(lambda, "Multinomial")
import NaiveBayes.{Bernoulli, Multinomial}

def this() = this(1.0, "Multinomial")
def this(lambda: Double) = this(lambda, NaiveBayes.Multinomial)

def this() = this(1.0, NaiveBayes.Multinomial)

/** Set the smoothing parameter. Default: 1.0. */
def setLambda(lambda: Double): NaiveBayes = {
Expand All @@ -272,12 +278,11 @@ class NaiveBayes private (

/**
* Set the model type using a string (case-sensitive).
* Supported options: "Multinomial" and "Bernoulli".
* (default: Multinomial)
* Supported options: "multinomial" (default) and "bernoulli".
*/
def setModelType(modelType:String): NaiveBayes = {
def setModelType(modelType: String): NaiveBayes = {
require(NaiveBayes.supportedModelTypes.contains(modelType),
s"NaiveBayes was created with an unknown ModelType: $modelType")
s"NaiveBayes was created with an unknown modelType: $modelType.")
this.modelType = modelType
this
}
Expand Down Expand Up @@ -308,7 +313,7 @@ class NaiveBayes private (
}
if (!values.forall(v => v == 0.0 || v == 1.0)) {
throw new SparkException(
s"Bernoulli Naive Bayes requires 0 or 1 feature values but found $v.")
s"Bernoulli naive Bayes requires 0 or 1 feature values but found $v.")
}
}

Expand All @@ -317,7 +322,7 @@ class NaiveBayes private (
// TODO: similar to reduceByKeyLocally to save one stage.
val aggregated = data.map(p => (p.label, p.features)).combineByKey[(Long, DenseVector)](
createCombiner = (v: Vector) => {
if (modelType == "Bernoulli") {
if (modelType == Bernoulli) {
requireZeroOneBernoulliValues(v)
} else {
requireNonnegativeValues(v)
Expand Down Expand Up @@ -352,11 +357,11 @@ class NaiveBayes private (
labels(i) = label
pi(i) = math.log(n + lambda) - piLogDenom
val thetaLogDenom = modelType match {
case "Multinomial" => math.log(sumTermFreqs.values.sum + numFeatures * lambda)
case "Bernoulli" => math.log(n + 2.0 * lambda)
case Multinomial => math.log(sumTermFreqs.values.sum + numFeatures * lambda)
case Bernoulli => math.log(n + 2.0 * lambda)
case _ =>
// This should never happen.
throw new UnknownError(s"NaiveBayes was created with an unknown ModelType: $modelType")
throw new UnknownError(s"Invalid modelType: $modelType.")
}
var j = 0
while (j < numFeatures) {
Expand All @@ -375,8 +380,14 @@ class NaiveBayes private (
*/
object NaiveBayes {

/** String name for multinomial model type. */
private[classification] val Multinomial: String = "multinomial"

/** String name for Bernoulli model type. */
private[classification] val Bernoulli: String = "bernoulli"

/* Set of modelTypes that NaiveBayes supports */
private[mllib] val supportedModelTypes = Set("Multinomial", "Bernoulli")
private[classification] val supportedModelTypes = Set(Multinomial, Bernoulli)

/**
* Trains a Naive Bayes model given an RDD of `(label, features)` pairs.
Expand Down Expand Up @@ -406,7 +417,7 @@ object NaiveBayes {
* @param lambda The smoothing parameter
*/
def train(input: RDD[LabeledPoint], lambda: Double): NaiveBayesModel = {
new NaiveBayes(lambda, "Multinomial").run(input)
new NaiveBayes(lambda, Multinomial).run(input)
}

/**
Expand All @@ -429,7 +440,7 @@ object NaiveBayes {
*/
def train(input: RDD[LabeledPoint], lambda: Double, modelType: String): NaiveBayesModel = {
require(supportedModelTypes.contains(modelType),
s"NaiveBayes was created with an unknown ModelType: $modelType")
s"NaiveBayes was created with an unknown modelType: $modelType.")
new NaiveBayes(lambda, modelType).run(input)
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ public Vector call(LabeledPoint v) throws Exception {
@Test
public void testModelTypeSetters() {
NaiveBayes nb = new NaiveBayes()
.setModelType("Bernoulli")
.setModelType("Multinomial");
.setModelType("bernoulli")
.setModelType("multinomial");
}
}
Loading

0 comments on commit 737fa8d

Please sign in to comment.