Skip to content

Commit

Permalink
Fix
Browse files Browse the repository at this point in the history
  • Loading branch information
JoshRosen committed Jul 8, 2024
1 parent c4085f1 commit 1942dfb
Showing 1 changed file with 4 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,10 @@ class CountVectorizerModel(
broadcastDict = Some(dataset.sparkSession.sparkContext.broadcast(dict))
}
val dictBr = broadcastDict.get
// SPARK-48837: capture parameter values here so that we only evaluate them once-per-transform
// rather than once-per-row:
val minTf = $(minTF)
val isBinary = $(binary)
val vectorizer = udf { document: Seq[String] =>
val termCounts = new OpenHashMap[Int, Double]
var tokenCount = 0L
Expand All @@ -322,7 +325,7 @@ class CountVectorizerModel(
tokenCount += 1
}
val effectiveMinTF = if (minTf >= 1.0) minTf else tokenCount * minTf
val effectiveCounts = if ($(binary)) {
val effectiveCounts = if (isBinary) {
termCounts.filter(_._2 >= effectiveMinTF).map(p => (p._1, 1.0)).toSeq
} else {
termCounts.filter(_._2 >= effectiveMinTF).toSeq
Expand Down

0 comments on commit 1942dfb

Please sign in to comment.