From 9b0e05147f5baceb3c63d5d5c5ad3a4dacc6a3d8 Mon Sep 17 00:00:00 2001
From: schmit
Date: Fri, 14 Mar 2014 14:49:39 -0700
Subject: [PATCH 1/6] Copy from incubator-spark

Still have to remove the dataset tests from the unit tests
---
 .../BinaryClassificationModel.scala           | 68 ++++++++++++++
 .../classification/ClassificationModel.scala  | 16 ++++
 .../classification/LogisticRegression.scala   | 54 +++++++++++-
 .../mllib/classification/NaiveBayes.scala     |  5 ++
 .../spark/mllib/classification/SVM.scala      | 39 +++++++-
 .../GeneralizedLinearAlgorithm.scala          |  3 +-
 .../BinaryClassificationEvaluationSuite.scala | 88 +++++++++++++++++++
 7 files changed, 268 insertions(+), 5 deletions(-)
 create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/classification/BinaryClassificationModel.scala
 create mode 100644 mllib/src/test/scala/org/apache/spark/mllib/classification/BinaryClassificationEvaluationSuite.scala

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/BinaryClassificationModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/BinaryClassificationModel.scala
new file mode 100644
index 0000000000000..85542ec2aefad
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/BinaryClassificationModel.scala
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.classification
+
+
+import org.apache.spark.SparkContext
+import org.apache.spark.SparkContext._
+import org.apache.spark.mllib.regression._
+import org.apache.spark.rdd.RDD
+
+trait BinaryClassificationModel extends ClassificationModel {
+  /**
+   * Return prediction scores and true labels in an RDD.
+   *
+   * @param input RDD of labeled points to use for the evaluation
+   * @return RDD[(Double, Double)] of (score, label) pairs, where the score
+   *         is the raw value the model assigns and a higher score indicates
+   *         greater confidence that the true label is 1.
+   */
+  def scoreForEval(input: RDD[LabeledPoint]) : RDD[(Double, Double)] = {
+    val predictionAndLabel = input.map { point =>
+      val scores = score(point.features)
+      (scores, point.label)
+    }
+    predictionAndLabel
+  }
+
+  /**
+   * Evaluate the performance of the model using the scores it assigns
+   * to observations and the true labels.
+   * Returns the area under the receiver operating characteristic (ROC) curve.
+   * Note that the model predicts label 0 when the score is negative,
+   * and label 1 otherwise.
+   *
+   * @param predictionAndLabel RDD with (score by model, true label)
+   * @return Double Area under curve of ROC
+   */
+  def areaUnderROC(predictionAndLabel: RDD[(Double, Double)]) : Double = {
+    val nObs = predictionAndLabel.count
+    val nPos = predictionAndLabel.filter(x => x._2 == 1.0).count
+    // sort according to the predicted score and add indices
+    val sortedPredictionsWithIndex = predictionAndLabel.sortByKey(true).zipWithIndex
+    // sum of the positive ranks
+    val sumPosRanks = sortedPredictionsWithIndex.filter(x => (x._1)._2 > 0).map(x => x._2 + 1).sum
+    // if there are no positive or no negative labels, the area under the curve is not defined.
+    // Return 0 in that case.
+    if ((nPos > 0) && (nObs > nPos)) {
+      (sumPosRanks - nPos * (nPos + 1) / 2) / (nPos * (nObs - nPos))
+    } else {
+      0
+    }
+  }
+}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala
index 391f5b9b7a7de..8a43101ca0c46 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala
@@ -35,4 +35,20 @@ trait ClassificationModel extends Serializable {
    * @return Int prediction from the trained model
    */
   def predict(testData: Array[Double]): Double
+
+  /**
+   * Score values for the given data set using the model trained.
+   *
+   * @param testData RDD representing data points to be scored
+   * @return RDD[Double] where each entry contains the corresponding score
+   */
+  def score(testData: RDD[Array[Double]]): RDD[Double]
+
+  /**
+   * Score values for a single data point using the model trained.
+   *
+   * @param testData array representing a single data point
+   * @return Double score from the trained model
+   */
+  def score(testData: Array[Double]): Double
 }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
index a481f522761e2..69e8da07c5e24 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
@@ -20,6 +20,8 @@ package org.apache.spark.mllib.classification
 import scala.math.round
 
 import org.apache.spark.SparkContext
+import org.apache.spark.SparkContext._
+
 import org.apache.spark.rdd.RDD
 import org.apache.spark.mllib.optimization._
 import org.apache.spark.mllib.regression._
@@ -38,12 +40,60 @@ class LogisticRegressionModel(
     override val weights: Array[Double],
     override val intercept: Double)
   extends GeneralizedLinearModel(weights, intercept)
-  with ClassificationModel with Serializable {
+  with BinaryClassificationModel with Serializable {
+
+  /**
+   * Predict the binary label (0.0 or 1.0) for a single data point using the model trained.
+   *
+   * @param testData array representing a single data point
+   * @return Double prediction from the trained model
+   */
   override def predictPoint(dataMatrix: DoubleMatrix, weightMatrix: DoubleMatrix,
       intercept: Double) = {
+    if (predictScore(dataMatrix, weightMatrix, intercept) < 0) 0.0 else 1.0
+  }
+
+  /**
+   * Compute the raw score (margin) of the linear model for a single data point.
+   *
+   * @param dataMatrix row vector representing a single data point
+   * @return Double score of the linear model. One can obtain probabilities by
+   *         applying the logistic (or sigmoid) function to this score.
+   */
+  def predictScore(dataMatrix: DoubleMatrix, weightMatrix: DoubleMatrix,
+    intercept: Double) = {
     val margin = dataMatrix.mmul(weightMatrix).get(0) + intercept
-    round(1.0/ (1.0 + math.exp(margin * -1)))
+    margin
   }
+
+  /**
+   * Score values for the given data set using the model trained.
+   *
+   * @param testData RDD representing data points to be scored
+   * @return RDD[Double] where each entry contains the corresponding score
+   */
+  def score(testData: RDD[Array[Double]]): RDD[Double] = {
+    // A small optimization to avoid serializing the entire model. Only the weightsMatrix
+    // and intercept are needed.
+    val localWeights = weightsMatrix
+    val localIntercept = intercept
+
+    testData.map { x =>
+      val dataMatrix = new DoubleMatrix(1, x.length, x:_*)
+      predictScore(dataMatrix, localWeights, localIntercept)
+    }
+  }
+
+  /**
+   * Score values for a single data point using the model trained.
+   *
+   * @param testData array representing a single data point
+   * @return Double score from the trained model
+   */
+  def score(testData: Array[Double]): Double = {
+    val dataMat = new DoubleMatrix(1, testData.length, testData:_*)
+    predictScore(dataMat, weightsMatrix, intercept)
+  }
 }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
index 6539b2f339465..90bf1dbe10c58 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
@@ -46,6 +46,11 @@ class NaiveBayesModel(val pi: Array[Double], val theta: Array[Array[Double]])
     val result = _pi.add(_theta.mmul(dataMatrix))
     result.argmax()
   }
+
+  // For naive Bayes the score is the same as the prediction.
+  def score(testData: RDD[Array[Double]]): RDD[Double] = testData.map(score)
+
+  def score(testData: Array[Double]): Double = predict(testData)
 }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala
index 6dff29dfb45cc..4f36137420904 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala
@@ -36,13 +36,48 @@ class SVMModel(
     override val weights: Array[Double],
     override val intercept: Double)
   extends GeneralizedLinearModel(weights, intercept)
-  with ClassificationModel with Serializable {
+  with BinaryClassificationModel with Serializable {
 
   override def predictPoint(dataMatrix: DoubleMatrix, weightMatrix: DoubleMatrix,
       intercept: Double) = {
-    val margin = dataMatrix.dot(weightMatrix) + intercept
+    val margin = predictScore(dataMatrix, weightMatrix, intercept)
     if (margin < 0) 0.0 else 1.0
   }
+
+  // Compute the raw score (margin) of the linear model for a single data point.
+  def predictScore(dataMatrix: DoubleMatrix, weightMatrix: DoubleMatrix,
+    intercept: Double) = {
+    dataMatrix.dot(weightMatrix) + intercept
+  }
+
+  /**
+   * Score values for the given data set using the model trained.
+   *
+   * @param testData RDD representing data points to be scored
+   * @return RDD[Double] where each entry contains the corresponding score
+   */
+  def score(testData: RDD[Array[Double]]): RDD[Double] = {
+    // A small optimization to avoid serializing the entire model. Only the weightsMatrix
+    // and intercept are needed.
+    val localWeights = weightsMatrix
+    val localIntercept = intercept
+
+    testData.map { x =>
+      val dataMatrix = new DoubleMatrix(1, x.length, x:_*)
+      predictScore(dataMatrix, localWeights, localIntercept)
+    }
+  }
+
+  /**
+   * Score values for a single data point using the model trained.
+   *
+   * @param testData array representing a single data point
+   * @return Double score from the trained model
+   */
+  def score(testData: Array[Double]): Double = {
+    val dataMat = new DoubleMatrix(1, testData.length, testData:_*)
+    predictScore(dataMat, weightsMatrix, intercept)
+  }
 }
 
 /**
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
index f98b0b536deaa..f41d79c8969d6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
@@ -35,7 +35,8 @@ abstract class GeneralizedLinearModel(val weights: Array[Double], val intercept:
   extends Serializable {
 
   // Create a column vector that can be used for predictions
-  private val weightsMatrix = new DoubleMatrix(weights.length, 1, weights:_*)
+  /// sven: switch back to private if possible
+  protected val weightsMatrix = new DoubleMatrix(weights.length, 1, weights:_*)
 
   /**
    * Predict the result given a data point and the weights learned.
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/BinaryClassificationEvaluationSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/BinaryClassificationEvaluationSuite.scala
new file mode 100644
index 0000000000000..0c27baeedd5ec
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/BinaryClassificationEvaluationSuite.scala
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.classification
+
+import scala.math.abs
+import scala.util.Random
+import scala.collection.JavaConversions._
+
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.FunSuite
+import org.scalatest.matchers.ShouldMatchers
+
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.regression._
+import org.apache.spark.mllib.util.LocalSparkContext
+
+class BinaryClassificationEvaluationSuite extends FunSuite with LocalSparkContext
+  with ShouldMatchers {
+  def validateResult(estVal: Double, trueVal: Double, tol: Double) {
+    abs(estVal - trueVal) should be < tol
+  }
+
+  // Test ROC area under the curve using synthetic output of a model
+  test("ROC area under curve, synthetic") {
+    val predictionAndLabelC = sc.parallelize(Array((3.0, 1.0), (-2.0, 0.0), (2.0, 1.0), (-1.0, 0.0),
+      (1.0, 1.0)))
+    val modelC = new LogisticRegressionModel(Array(0.0), 0.0)
+    val aucRocC = modelC.areaUnderROC(predictionAndLabelC)
+    validateResult(aucRocC, 1.0, 0.01)
+
+    val predictionAndLabelR = sc.parallelize(Array((0.45, 1.0), (-0.23, 0.0), (-0.34, 1.0),
+      (-0.42, 0.0), (0.62, 1.0)))
+    val modelR = new LogisticRegressionModel(Array(0.0), 0.0)
+    val aucRocR = modelR.areaUnderROC(predictionAndLabelR)
+    validateResult(aucRocR, 0.8333, 0.01)
+  }
+
+  // Test ROC area under the curve using a small data set and logistic regression
+  test("ROC area under curve, real data, LR") {
+    val data = sc.textFile("data/sample_logistic.txt")
+    val parsedData = data.map { line =>
+      val parts = line.split(' ')
+      LabeledPoint(parts(0).toDouble, parts.tail.map(x => x.toDouble).toArray)
+    }
+
+    parsedData.cache()
+    val lr = new LogisticRegressionWithSGD()
+    lr.optimizer.setStepSize(10.0).setNumIterations(200)
+
+    val model = lr.run(parsedData)
+    val predictionAndLabel = model.scoreForEval(parsedData)
+    val aucROC = model.areaUnderROC(predictionAndLabel)
+    validateResult(aucROC, 0.84, 0.03)
+  }
+
+  // Test ROC area under the curve using a small data set and svm
+  test("ROC area under curve, real data, SVM") {
+    val data = sc.textFile("data/sample_logistic.txt")
+    val parsedData = data.map { line => val parts = line.split(' ')
+      LabeledPoint(parts(0).toDouble, parts.tail.map(x => x.toDouble).toArray)
+    }
+
+    parsedData.cache()
+
+    val svm = new SVMWithSGD()
+    svm.optimizer.setStepSize(1.0).setRegParam(1.0).setNumIterations(200)
+    val model = svm.run(parsedData)
+
+    val predictionAndLabel = model.scoreForEval(parsedData)
+    val aucROC = model.areaUnderROC(predictionAndLabel)
+    validateResult(aucROC, 0.86, 0.07)
+  }
+}

From 22e56f229c088c1c405944ebfcd3ffac1a41c518 Mon Sep 17 00:00:00 2001
From: schmit
Date: Sun, 16 Mar 2014 19:39:40 -0700
Subject: [PATCH 2/6] Remove the data tests

---
 .../BinaryClassificationEvaluationSuite.scala | 45 +++++--------------
 1 file changed, 12 insertions(+), 33 deletions(-)

diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/BinaryClassificationEvaluationSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/BinaryClassificationEvaluationSuite.scala
index 0c27baeedd5ec..bd5ce9dd3f05f 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/classification/BinaryClassificationEvaluationSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/BinaryClassificationEvaluationSuite.scala
@@ -36,7 +36,7 @@ class BinaryClassificationEvaluationSuite extends FunSuite with LocalSparkContex
   }
 
   // Test ROC area under the curve using synthetic output of a model
-  test("ROC area under curve, synthetic") {
+  test("ROC area under curve, synthetic, LR") {
     val predictionAndLabelC = sc.parallelize(Array((3.0, 1.0), (-2.0, 0.0), (2.0, 1.0), (-1.0, 0.0),
       (1.0, 1.0)))
     val modelC = new LogisticRegressionModel(Array(0.0), 0.0)
@@ -50,39 +50,18 @@ class BinaryClassificationEvaluationSuite extends FunSuite with LocalSparkContex
     validateResult(aucRocR, 0.8333, 0.01)
   }
 
-  // Test ROC area under the curve using a small data set and logistic regression
-  test("ROC area under curve, real data, LR") {
-    val data = sc.textFile("data/sample_logistic.txt")
-    val parsedData = data.map { line =>
-      val parts = line.split(' ')
-      LabeledPoint(parts(0).toDouble, parts.tail.map(x => x.toDouble).toArray)
-    }
-
-    parsedData.cache()
-    val lr = new LogisticRegressionWithSGD()
-    lr.optimizer.setStepSize(10.0).setNumIterations(200)
-
-    val model = lr.run(parsedData)
-    val predictionAndLabel = model.scoreForEval(parsedData)
-    val aucROC = model.areaUnderROC(predictionAndLabel)
-    validateResult(aucROC, 0.84, 0.03)
-  }
-
-  // Test ROC area under the curve using a small data set and svm
-  test("ROC area under curve, real data, SVM") {
-    val data = sc.textFile("data/sample_logistic.txt")
-    val parsedData = data.map { line => val parts = line.split(' ')
-      LabeledPoint(parts(0).toDouble, parts.tail.map(x => x.toDouble).toArray)
-    }
-
-    parsedData.cache()
-
-    val svm = new SVMWithSGD()
-    svm.optimizer.setStepSize(1.0).setRegParam(1.0).setNumIterations(200)
-    val model = svm.run(parsedData)
+  // Test ROC area under the curve using synthetic output of an SVM model
+  test("ROC area under curve, synthetic, SVM") {
+    val predictionAndLabelC = sc.parallelize(Array((3.0, 1.0), (-2.0, 0.0), (2.0, 1.0), (-1.0, 0.0),
+      (1.0, 1.0)))
+    val modelC = new SVMModel(Array(0.0), 0.0)
+    val aucRocC = modelC.areaUnderROC(predictionAndLabelC)
+    validateResult(aucRocC, 1.0, 0.01)
 
-    val predictionAndLabel = model.scoreForEval(parsedData)
-    val aucROC = model.areaUnderROC(predictionAndLabel)
-    validateResult(aucROC, 0.86, 0.07)
+    val predictionAndLabelR = sc.parallelize(Array((0.45, 1.0), (-0.23, 0.0), (-0.34, 1.0),
+      (-0.42, 0.0), (0.62, 1.0)))
+    val modelR = new SVMModel(Array(0.0), 0.0)
+    val aucRocR = modelR.areaUnderROC(predictionAndLabelR)
+    validateResult(aucRocR, 0.8333, 0.01)
   }
 }

From ffae83b6602df83f3422f66e7de419eb5dd83d75 Mon Sep 17 00:00:00 2001
From: schmit
Date: Sun, 16 Mar 2014 19:46:48 -0700
Subject: [PATCH 3/6] remove explicit margin LR predictScore

---
 .../spark/mllib/classification/LogisticRegression.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
index 69e8da07c5e24..ad75198851e61 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
@@ -63,8 +63,8 @@ class LogisticRegressionModel(
    */
   def predictScore(dataMatrix: DoubleMatrix, weightMatrix: DoubleMatrix,
     intercept: Double) = {
-    val margin = dataMatrix.mmul(weightMatrix).get(0) + intercept
-    margin
+    // return the margin (raw score) directly
+    dataMatrix.mmul(weightMatrix).get(0) + intercept
   }
 
   /**

From ba7de4d2daffa084988e4f2a7eb7d8a37f5f015c Mon Sep 17 00:00:00 2001
From: schmit
Date: Sun, 16 Mar 2014 19:49:03 -0700
Subject: [PATCH 4/6] removed comment

---
 .../spark/mllib/regression/GeneralizedLinearAlgorithm.scala | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
index f41d79c8969d6..2a73ff3e49f57 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
@@ -35,7 +35,6 @@ abstract class GeneralizedLinearModel(val weights: Array[Double], val intercept:
   extends Serializable {
 
   // Create a column vector that can be used for predictions
-  /// sven: switch back to private if possible
   protected val weightsMatrix = new DoubleMatrix(weights.length, 1, weights:_*)
 
   /**

From 799ee8e264467770b89e9214b17324133832904b Mon Sep 17 00:00:00 2001
From: schmit
Date: Tue, 18 Mar 2014 18:34:44 -0700
Subject: [PATCH 5/6] Fix overflow issue pointed out by @srowen

---
 .../mllib/classification/BinaryClassificationModel.scala | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/BinaryClassificationModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/BinaryClassificationModel.scala
index 85542ec2aefad..e0dc510262e50 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/BinaryClassificationModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/BinaryClassificationModel.scala
@@ -51,8 +51,8 @@ trait BinaryClassificationModel extends ClassificationModel {
    * @return Double Area under curve of ROC
    */
   def areaUnderROC(predictionAndLabel: RDD[(Double, Double)]) : Double = {
-    val nObs = predictionAndLabel.count
-    val nPos = predictionAndLabel.filter(x => x._2 == 1.0).count
+    val nObs = predictionAndLabel.count.toDouble
+    val nPos = predictionAndLabel.filter(x => x._2 == 1.0).count.toDouble
     // sort according to the predicted score and add indices
     val sortedPredictionsWithIndex = predictionAndLabel.sortByKey(true).zipWithIndex
     // sum of the positive ranks
@@ -62,7 +62,7 @@ trait BinaryClassificationModel extends ClassificationModel {
     if ((nPos > 0) && (nObs > nPos)) {
       (sumPosRanks - nPos * (nPos + 1) / 2) / (nPos * (nObs - nPos))
     } else {
-      0
+      0.0
     }
   }
 }

From 1d0d68c93ecab163f7988d593845ef4c0b42b7c1 Mon Sep 17 00:00:00 2001
From: schmit
Date: Fri, 21 Mar 2014 10:57:59 -0700
Subject: [PATCH 6/6] update to definition of nPos, suggested by @srowen

---
 .../spark/mllib/classification/BinaryClassificationModel.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/BinaryClassificationModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/BinaryClassificationModel.scala
index e0dc510262e50..c87b83b606632 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/BinaryClassificationModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/BinaryClassificationModel.scala
@@ -52,7 +52,7 @@ trait BinaryClassificationModel extends ClassificationModel {
    */
   def areaUnderROC(predictionAndLabel: RDD[(Double, Double)]) : Double = {
     val nObs = predictionAndLabel.count.toDouble
-    val nPos = predictionAndLabel.filter(x => x._2 == 1.0).count.toDouble
+    val nPos = predictionAndLabel.filter(x => x._2 > 0.5).count.toDouble
     // sort according to the predicted score and add indices
     val sortedPredictionsWithIndex = predictionAndLabel.sortByKey(true).zipWithIndex
     // sum of the positive ranks
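
Note on the evaluation formula: areaUnderROC computes the AUC as the normalized Mann-Whitney U statistic, summing the 1-based ranks of the positive examples in the score ordering, subtracting the minimum possible rank sum nPos * (nPos + 1) / 2, and dividing by the number of positive/negative pairs. As a cross-check, here is a minimal self-contained Scala sketch (not part of the patch series; plain collections stand in for the RDD operations, and the object name AucRankSumSketch is invented for illustration) that reproduces the 0.8333 value asserted in the synthetic tests:

object AucRankSumSketch {
  // Local rendering of the trait's areaUnderROC: rank-sum (Mann-Whitney) AUC.
  def areaUnderROC(scoreAndLabel: Seq[(Double, Double)]): Double = {
    val nObs = scoreAndLabel.size.toDouble
    val nPos = scoreAndLabel.count(_._2 > 0.5).toDouble
    // sort by score ascending and attach 1-based ranks
    val ranked = scoreAndLabel.sortBy(_._1).zipWithIndex
    // sum the ranks of the positive examples
    val sumPosRanks = ranked.collect { case ((_, label), i) if label > 0.5 => i + 1.0 }.sum
    // AUC is undefined when one class is absent; mirror the trait and return 0.0
    if (nPos > 0 && nObs > nPos) {
      (sumPosRanks - nPos * (nPos + 1) / 2) / (nPos * (nObs - nPos))
    } else {
      0.0
    }
  }

  def main(args: Array[String]): Unit = {
    // second synthetic data set from BinaryClassificationEvaluationSuite;
    // sorted by score, the positives sit at ranks 2, 4 and 5, so
    // AUC = (11 - 3 * 4 / 2) / (3 * 2) = 5 / 6 = 0.8333...
    val data = Seq((0.45, 1.0), (-0.23, 0.0), (-0.34, 1.0), (-0.42, 0.0), (0.62, 1.0))
    println(areaUnderROC(data))
  }
}

With tied scores the ordering after the sort is arbitrary, so a strict treatment would average the ranks of ties; neither this sketch nor the patch does so. The final patch's change of the nPos test from label == 1.0 to label > 0.5 simply makes the positive-label count robust to floating-point label values.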