diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala
index 9419c3f5989be..7dcfbf741c4f1 100644
--- a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala
@@ -678,7 +678,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])
    * here.
    *
    * @param relativeSD Relative accuracy. Smaller values create counters that require more space.
-   *                   It should be greater than 0.000017.
+   *                   It must be greater than 0.000017.
    * @param partitioner partitioner of the resulting RDD.
    */
   def countApproxDistinctByKey(relativeSD: Double, partitioner: Partitioner): JavaPairRDD[K, Long] =
@@ -694,7 +694,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])
    * here.
    *
    * @param relativeSD Relative accuracy. Smaller values create counters that require more space.
-   *                   It should be greater than 0.000017.
+   *                   It must be greater than 0.000017.
    * @param numPartitions number of partitions of the resulting RDD.
    */
   def countApproxDistinctByKey(relativeSD: Double, numPartitions: Int): JavaPairRDD[K, Long] = {
@@ -709,7 +709,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])
    * here.
    *
    * @param relativeSD Relative accuracy. Smaller values create counters that require more space.
-   *                   It should be greater than 0.000017.
+   *                   It must be greater than 0.000017.
    */
   def countApproxDistinctByKey(relativeSD: Double): JavaPairRDD[K, Long] = {
     fromRDD(rdd.countApproxDistinctByKey(relativeSD))
diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala
index 2741532732c27..330569a8d8837 100644
--- a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala
+++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala
@@ -565,7 +565,7 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
    * here.
    *
    * @param relativeSD Relative accuracy. Smaller values create counters that require more space.
-   *                   It should be greater than 0.000017.
+   *                   It must be greater than 0.000017.
    */
   def countApproxDistinct(relativeSD: Double): Long = rdd.countApproxDistinct(relativeSD)
 
diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
index d459815ae7cbe..f2ce3cbd47f93 100644
--- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
@@ -225,16 +225,16 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
    * would trigger sparse representation of registers, which may reduce the memory consumption
    * and increase accuracy when the cardinality is small.
    *
-   *@param p The precision value for the normal set.
-   *          `p` must be a value between 4 and `sp` (32 max).
+   * @param p The precision value for the normal set.
+   *          `p` must be a value between 4 and `sp` if `sp` is not zero (32 max).
    * @param sp The precision value for the sparse set, between 0 and 32.
    *           If `sp` equals 0, the sparse representation is skipped.
    * @param partitioner Partitioner to use for the resulting RDD.
    */
   @Experimental
   def countApproxDistinctByKey(p: Int, sp: Int, partitioner: Partitioner): RDD[(K, Long)] = {
-    require(p >= 4, s"p ($p) should be >= 4")
-    require(sp <= 32, s"sp ($sp) should be <= 32")
+    require(p >= 4, s"p ($p) must be >= 4")
+    require(sp <= 32, s"sp ($sp) must be <= 32")
     require(sp == 0 || p <= sp, s"p ($p) cannot be greater than sp ($sp)")
     val createHLL = (v: V) => {
       val hll = new HyperLogLogPlus(p, sp)
@@ -261,11 +261,11 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
    * here.
    *
    * @param relativeSD Relative accuracy. Smaller values create counters that require more space.
-   *                   It should be greater than 0.000017.
+   *                   It must be greater than 0.000017.
    * @param partitioner partitioner of the resulting RDD
    */
   def countApproxDistinctByKey(relativeSD: Double, partitioner: Partitioner): RDD[(K, Long)] = {
-    require(relativeSD > 0.000017, s"accuracy ($relativeSD) should be greater than 0.000017")
+    require(relativeSD > 0.000017, s"accuracy ($relativeSD) must be greater than 0.000017")
     val p = math.ceil(2.0 * math.log(1.054 / relativeSD) / math.log(2)).toInt
     assert(p <= 32)
     countApproxDistinctByKey(if (p < 4) 4 else p, 0, partitioner)
@@ -279,7 +279,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
    * here.
    *
    * @param relativeSD Relative accuracy. Smaller values create counters that require more space.
-   *                   It should be greater than 0.000017.
+   *                   It must be greater than 0.000017.
    * @param numPartitions number of partitions of the resulting RDD
    */
   def countApproxDistinctByKey(relativeSD: Double, numPartitions: Int): RDD[(K, Long)] = {
@@ -294,7 +294,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
    * here.
    *
    * @param relativeSD Relative accuracy. Smaller values create counters that require more space.
-   *                   It should be greater than 0.000017.
+   *                   It must be greater than 0.000017.
    */
   def countApproxDistinctByKey(relativeSD: Double = 0.05): RDD[(K, Long)] = {
     countApproxDistinctByKey(relativeSD, defaultPartitioner(self))
diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
index 58375b9b07c32..585b2f76afa65 100644
--- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -929,9 +929,9 @@ abstract class RDD[T: ClassTag](
    * and increase accuracy when the cardinality is small.
    *
    * @param p The precision value for the normal set.
-   *          p must be a value between 4 and sp.
+   *          `p` must be a value between 4 and `sp` if `sp` is not zero (32 max).
    * @param sp The precision value for the sparse set, between 0 and 32.
-   *           If sp equals 0, the sparse representation is skipped.
+   *           If `sp` equals 0, the sparse representation is skipped.
    */
   @Experimental
   def countApproxDistinct(p: Int, sp: Int): Long = {
@@ -958,6 +958,7 @@ abstract class RDD[T: ClassTag](
    * here.
    *
    * @param relativeSD Relative accuracy. Smaller values create counters that require more space.
+   *                   It must be greater than 0.000017.
    */
   def countApproxDistinct(relativeSD: Double = 0.05): Long = {
     val p = math.ceil(2.0 * math.log(1.054 / relativeSD) / math.log(2)).toInt
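
For reviewers, below is a minimal sketch (not part of this patch) showing how the documented constraints play out at the call sites. It assumes a local SparkContext, an illustrative object name (CountApproxDistinctSketch) and RDD name (pairs), and the Spark 1.x-style `import org.apache.spark.SparkContext._` to bring the PairRDDFunctions implicits into scope.

import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
import org.apache.spark.SparkContext._  // PairRDDFunctions implicits on Spark 1.x; harmless on later versions

object CountApproxDistinctSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("hll-sketch").setMaster("local[2]"))
    val pairs = sc.parallelize(1 to 100000).map(i => (i % 10, i))  // 10 keys, 10000 distinct values each

    // relativeSD is the target relative accuracy. It must be > 0.000017, because the precision
    // is derived as p = ceil(2 * log2(1.054 / relativeSD)) and the assert above requires p <= 32.
    val perKey = pairs.countApproxDistinctByKey(0.05)

    // The @Experimental overload exposes the HyperLogLog++ precisions directly:
    // p must be >= 4, sp must be <= 32, and p <= sp whenever sp is non-zero.
    val tunedPerKey = pairs.countApproxDistinctByKey(12, 25, new HashPartitioner(4))

    // The non-keyed variant on RDD carries the same relativeSD constraint.
    val total = pairs.values.countApproxDistinct(0.01)

    perKey.collect().foreach { case (k, c) => println(s"key $k ~ $c distinct values") }
    println(s"tuned per-key estimates: ${tunedPerKey.collect().toMap}")
    println(s"~$total distinct values overall")
    sc.stop()
  }
}

With relativeSD just above the documented bound (for example 0.0000171), 2 * log2(1.054 / relativeSD) evaluates to about 31.8, so the ceiling lands exactly on the maximum precision of 32; any smaller value would push p past 32 and trip the assert, which is why the docs and require messages now say "must".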