Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-7752][MLLIB] Use lowercase letters for NaiveBayes.modelType #6277

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions docs/mllib-naive-bayes.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ Within that context, each observation is a document and each
feature represents a term whose value is the frequency of the term (in multinomial naive Bayes) or
a zero or one indicating whether the term was found in the document (in Bernoulli naive Bayes).
Feature values must be nonnegative. The model type is selected with an optional parameter
"Multinomial" or "Bernoulli" with "Multinomial" as the default.
"multinomial" or "bernoulli" with "multinomial" as the default.
[Additive smoothing](http://en.wikipedia.org/wiki/Lidstone_smoothing) can be used by
setting the parameter $\lambda$ (default to $1.0$). For document classification, the input feature
vectors are usually sparse, and sparse vectors should be supplied as input to take advantage of
Expand All @@ -35,7 +35,7 @@ sparsity. Since the training data is only used once, it is not necessary to cach
[NaiveBayes](api/scala/index.html#org.apache.spark.mllib.classification.NaiveBayes$) implements
multinomial naive Bayes. It takes an RDD of
[LabeledPoint](api/scala/index.html#org.apache.spark.mllib.regression.LabeledPoint) and an optional
smoothing parameter `lambda` as input, an optional model type parameter (default is Multinomial), and outputs a
smoothing parameter `lambda` as input, an optional model type parameter (default is "multinomial"), and outputs a
[NaiveBayesModel](api/scala/index.html#org.apache.spark.mllib.classification.NaiveBayesModel), which
can be used for evaluation and prediction.

Expand All @@ -54,7 +54,7 @@ val splits = parsedData.randomSplit(Array(0.6, 0.4), seed = 11L)
val training = splits(0)
val test = splits(1)

val model = NaiveBayes.train(training, lambda = 1.0, model = "Multinomial")
val model = NaiveBayes.train(training, lambda = 1.0, model = "multinomial")

val predictionAndLabel = test.map(p => (model.predict(p.features), p.label))
val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()
Expand All @@ -75,14 +75,15 @@ optionally smoothing parameter `lambda` as input, and output a
can be used for evaluation and prediction.

{% highlight java %}
import scala.Tuple2;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.mllib.classification.NaiveBayes;
import org.apache.spark.mllib.classification.NaiveBayesModel;
import org.apache.spark.mllib.regression.LabeledPoint;
import scala.Tuple2;

JavaRDD<LabeledPoint> training = ... // training set
JavaRDD<LabeledPoint> test = ... // test set
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,21 +25,20 @@ import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.{Logging, SparkContext, SparkException}
import org.apache.spark.mllib.linalg.{BLAS, DenseMatrix, DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.linalg.{BLAS, DenseMatrix, DenseVector, SparseVector, Vector}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.{Loader, Saveable}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SQLContext}


/**
* Model for Naive Bayes Classifiers.
*
* @param labels list of labels
* @param pi log of class priors, whose dimension is C, number of labels
* @param theta log of class conditional probabilities, whose dimension is C-by-D,
* where D is number of features
* @param modelType The type of NB model to fit can be "Multinomial" or "Bernoulli"
* @param modelType The type of NB model to fit can be "multinomial" or "bernoulli"
*/
class NaiveBayesModel private[mllib] (
val labels: Array[Double],
Expand All @@ -48,11 +47,13 @@ class NaiveBayesModel private[mllib] (
val modelType: String)
extends ClassificationModel with Serializable with Saveable {

import NaiveBayes.{Bernoulli, Multinomial, supportedModelTypes}

private val piVector = new DenseVector(pi)
private val thetaMatrix = new DenseMatrix(labels.size, theta(0).size, theta.flatten, true)
private val thetaMatrix = new DenseMatrix(labels.length, theta(0).length, theta.flatten, true)

private[mllib] def this(labels: Array[Double], pi: Array[Double], theta: Array[Array[Double]]) =
this(labels, pi, theta, "Multinomial")
this(labels, pi, theta, NaiveBayes.Multinomial)

/** A Java-friendly constructor that takes three Iterable parameters. */
private[mllib] def this(
Expand All @@ -61,12 +62,15 @@ class NaiveBayesModel private[mllib] (
theta: JIterable[JIterable[Double]]) =
this(labels.asScala.toArray, pi.asScala.toArray, theta.asScala.toArray.map(_.asScala.toArray))

require(supportedModelTypes.contains(modelType),
s"Invalid model type $modelType. Supported model types are $supportedModelTypes.")

// Bernoulli scoring requires log(condprob) if 1, log(1-condprob) if 0.
// This precomputes log(1.0 - exp(theta)) and its sum which are used for the linear algebra
// application of this condition (in predict function).
private val (thetaMinusNegTheta, negThetaSum) = modelType match {
case "Multinomial" => (None, None)
case "Bernoulli" =>
case Multinomial => (None, None)
case Bernoulli =>
val negTheta = thetaMatrix.map(value => math.log(1.0 - math.exp(value)))
val ones = new DenseVector(Array.fill(thetaMatrix.numCols){1.0})
val thetaMinusNegTheta = thetaMatrix.map { value =>
Expand All @@ -75,7 +79,7 @@ class NaiveBayesModel private[mllib] (
(Option(thetaMinusNegTheta), Option(negTheta.multiply(ones)))
case _ =>
// This should never happen.
throw new UnknownError(s"NaiveBayesModel was created with an unknown ModelType: $modelType")
throw new UnknownError(s"Invalid model type: $modelType.")
}

override def predict(testData: RDD[Vector]): RDD[Double] = {
Expand All @@ -88,15 +92,15 @@ class NaiveBayesModel private[mllib] (

override def predict(testData: Vector): Double = {
modelType match {
case "Multinomial" =>
case Multinomial =>
val prob = thetaMatrix.multiply(testData)
BLAS.axpy(1.0, piVector, prob)
labels(prob.argmax)
case "Bernoulli" =>
case Bernoulli =>
testData.foreachActive { (index, value) =>
if (value != 0.0 && value != 1.0) {
throw new SparkException(
s"Bernoulli Naive Bayes requires 0 or 1 feature values but found $testData.")
s"Bernoulli naive Bayes requires 0 or 1 feature values but found $testData.")
}
}
val prob = thetaMinusNegTheta.get.multiply(testData)
Expand All @@ -105,7 +109,7 @@ class NaiveBayesModel private[mllib] (
labels(prob.argmax)
case _ =>
// This should never happen.
throw new UnknownError(s"NaiveBayesModel was created with an unknown ModelType: $modelType")
throw new UnknownError(s"Invalid model type: $modelType.")
}
}

Expand Down Expand Up @@ -230,16 +234,16 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] {
s"($loadedClassName, $version). Supported:\n" +
s" ($classNameV1_0, 1.0)")
}
assert(model.pi.size == numClasses,
assert(model.pi.length == numClasses,
s"NaiveBayesModel.load expected $numClasses classes," +
s" but class priors vector pi had ${model.pi.size} elements")
assert(model.theta.size == numClasses,
s" but class priors vector pi had ${model.pi.length} elements")
assert(model.theta.length == numClasses,
s"NaiveBayesModel.load expected $numClasses classes," +
s" but class conditionals array theta had ${model.theta.size} elements")
assert(model.theta.forall(_.size == numFeatures),
s" but class conditionals array theta had ${model.theta.length} elements")
assert(model.theta.forall(_.length == numFeatures),
s"NaiveBayesModel.load expected $numFeatures features," +
s" but class conditionals array theta had elements of size:" +
s" ${model.theta.map(_.size).mkString(",")}")
s" ${model.theta.map(_.length).mkString(",")}")
model
}
}
Expand All @@ -257,9 +261,11 @@ class NaiveBayes private (
private var lambda: Double,
private var modelType: String) extends Serializable with Logging {

def this(lambda: Double) = this(lambda, "Multinomial")
import NaiveBayes.{Bernoulli, Multinomial}

def this() = this(1.0, "Multinomial")
def this(lambda: Double) = this(lambda, NaiveBayes.Multinomial)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"NaiveBayes.Multinomial" --> "Multinomial"

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This won't compile, because this(...) is not in the scope of NaiveBayes.


def this() = this(1.0, NaiveBayes.Multinomial)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ditto


/** Set the smoothing parameter. Default: 1.0. */
def setLambda(lambda: Double): NaiveBayes = {
Expand All @@ -272,12 +278,11 @@ class NaiveBayes private (

/**
* Set the model type using a string (case-sensitive).
* Supported options: "Multinomial" and "Bernoulli".
* (default: Multinomial)
* Supported options: "multinomial" (default) and "bernoulli".
*/
def setModelType(modelType:String): NaiveBayes = {
def setModelType(modelType: String): NaiveBayes = {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As long as you're at it, can you change this to be case-insensitive so it's a little more robust?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried that option in the first version. Making it case-insensitive is easy but it may cause unexpected errors in the user code.

nb.setModelType("Bernoulli")
if (nb.getModelType() == "Bernoulli") { // It becomes lowercase here.
  ...
}

I can keep the input untouched but we need to call toLowerCase every time we need it, or store two modelTypes, one matching the user input and the other with lowercase letters. The overhead seems to be unnecessary. It was case-sensitive and the PR doesn't modify this property.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fair enough

require(NaiveBayes.supportedModelTypes.contains(modelType),
s"NaiveBayes was created with an unknown ModelType: $modelType")
s"NaiveBayes was created with an unknown model type: $modelType.")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"model type" --> "modelType" to match parameter name

this.modelType = modelType
this
}
Expand Down Expand Up @@ -308,7 +313,7 @@ class NaiveBayes private (
}
if (!values.forall(v => v == 0.0 || v == 1.0)) {
throw new SparkException(
s"Bernoulli Naive Bayes requires 0 or 1 feature values but found $v.")
s"Bernoulli naive Bayes requires 0 or 1 feature values but found $v.")
}
}

Expand All @@ -317,7 +322,7 @@ class NaiveBayes private (
// TODO: similar to reduceByKeyLocally to save one stage.
val aggregated = data.map(p => (p.label, p.features)).combineByKey[(Long, DenseVector)](
createCombiner = (v: Vector) => {
if (modelType == "Bernoulli") {
if (modelType == Bernoulli) {
requireZeroOneBernoulliValues(v)
} else {
requireNonnegativeValues(v)
Expand Down Expand Up @@ -352,11 +357,11 @@ class NaiveBayes private (
labels(i) = label
pi(i) = math.log(n + lambda) - piLogDenom
val thetaLogDenom = modelType match {
case "Multinomial" => math.log(sumTermFreqs.values.sum + numFeatures * lambda)
case "Bernoulli" => math.log(n + 2.0 * lambda)
case Multinomial => math.log(sumTermFreqs.values.sum + numFeatures * lambda)
case Bernoulli => math.log(n + 2.0 * lambda)
case _ =>
// This should never happen.
throw new UnknownError(s"NaiveBayes was created with an unknown ModelType: $modelType")
throw new UnknownError(s"Invalid model type: $modelType.")
}
var j = 0
while (j < numFeatures) {
Expand All @@ -375,8 +380,14 @@ class NaiveBayes private (
*/
object NaiveBayes {

/** String name for multinomial model type. */
private[classification] val Multinomial: String = "multinomial"

/** String name for Bernoulli model type. */
private[classification] val Bernoulli: String = "bernoulli"

/* Set of modelTypes that NaiveBayes supports */
private[mllib] val supportedModelTypes = Set("Multinomial", "Bernoulli")
private[classification] val supportedModelTypes = Set(Multinomial, Bernoulli)

/**
* Trains a Naive Bayes model given an RDD of `(label, features)` pairs.
Expand Down Expand Up @@ -406,7 +417,7 @@ object NaiveBayes {
* @param lambda The smoothing parameter
*/
def train(input: RDD[LabeledPoint], lambda: Double): NaiveBayesModel = {
new NaiveBayes(lambda, "Multinomial").run(input)
new NaiveBayes(lambda, Multinomial).run(input)
}

/**
Expand All @@ -429,7 +440,7 @@ object NaiveBayes {
*/
def train(input: RDD[LabeledPoint], lambda: Double, modelType: String): NaiveBayesModel = {
require(supportedModelTypes.contains(modelType),
s"NaiveBayes was created with an unknown ModelType: $modelType")
s"NaiveBayes was created with an unknown model type: $modelType.")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"model type" --> "modelType"

new NaiveBayes(lambda, modelType).run(input)
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ public Vector call(LabeledPoint v) throws Exception {
@Test
public void testModelTypeSetters() {
NaiveBayes nb = new NaiveBayes()
.setModelType("Bernoulli")
.setModelType("Multinomial");
.setModelType("bernoulli")
.setModelType("multinomial");
}
}
Loading