Fix

JoshRosen · Jul 8, 2024 · 1942dfb · 1942dfb
1 parent c4085f1
commit 1942dfb
Showing 1 changed file with 4 additions and 1 deletion.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
@@ -310,7 +310,10 @@ class CountVectorizerModel(
       broadcastDict = Some(dataset.sparkSession.sparkContext.broadcast(dict))
     }
     val dictBr = broadcastDict.get
+    // SPARK-48837: capture parameter values here so that we only evaluate once-per-transform
+    // rather than once-per-row:
     val minTf = $(minTF)
+    val isBinary = $(binary)
     val vectorizer = udf { document: Seq[String] =>
       val termCounts = new OpenHashMap[Int, Double]
       var tokenCount = 0L
@@ -322,7 +325,7 @@ class CountVectorizerModel(
         tokenCount += 1
       }
       val effectiveMinTF = if (minTf >= 1.0) minTf else tokenCount * minTf
-      val effectiveCounts = if ($(binary)) {
+      val effectiveCounts = if (isBinary) {
         termCounts.filter(_._2 >= effectiveMinTF).map(p => (p._1, 1.0)).toSeq
       } else {
         termCounts.filter(_._2 >= effectiveMinTF).toSeq