@@ -672,38 +672,47 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])
672
672
673
673
/**
674
674
* Return approximate number of distinct values for each key in this RDD.
675
- * The accuracy of approximation can be controlled through the relative standard deviation
676
- * (relativeSD) parameter, which also controls the amount of memory used. Lower values result in
677
- * more accurate counts but increase the memory footprint and vise versa. Uses the provided
678
- * Partitioner to partition the output RDD.
675
+ *
676
+ * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice:
677
+ * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available
678
+ * <a href="http://dx.doi.org/10.1145/2452376.2452456">here</a>.
679
+ *
680
+ * @param relativeSD Relative accuracy. Smaller values create counters that require more space.
681
+ * It must be greater than 0.000017.
682
+ * @param partitioner partitioner of the resulting RDD.
679
683
*/
680
- def countApproxDistinctByKey (relativeSD : Double , partitioner : Partitioner ): JavaRDD [(K , Long )] = {
681
- rdd.countApproxDistinctByKey(relativeSD, partitioner)
684
+ def countApproxDistinctByKey (relativeSD : Double , partitioner : Partitioner ): JavaPairRDD [K , Long ] =
685
+ {
686
+ fromRDD(rdd.countApproxDistinctByKey(relativeSD, partitioner))
682
687
}
683
688
684
689
/**
685
- * Return approximate number of distinct values for each key this RDD.
686
- * The accuracy of approximation can be controlled through the relative standard deviation
687
- * (relativeSD) parameter, which also controls the amount of memory used. Lower values result in
688
- * more accurate counts but increase the memory footprint and vise versa. The default value of
689
- * relativeSD is 0.05. Hash-partitions the output RDD using the existing partitioner/parallelism
690
- * level.
690
+ * Return approximate number of distinct values for each key in this RDD.
691
+ *
692
+ * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice:
693
+ * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available
694
+ * <a href="http://dx.doi.org/10.1145/2452376.2452456">here</a>.
695
+ *
696
+ * @param relativeSD Relative accuracy. Smaller values create counters that require more space.
697
+ * It must be greater than 0.000017.
698
+ * @param numPartitions number of partitions of the resulting RDD.
691
699
*/
692
- def countApproxDistinctByKey (relativeSD : Double = 0.05 ): JavaRDD [( K , Long ) ] = {
693
- rdd.countApproxDistinctByKey(relativeSD)
700
+ def countApproxDistinctByKey (relativeSD : Double , numPartitions : Int ): JavaPairRDD [ K , Long ] = {
701
+ fromRDD( rdd.countApproxDistinctByKey(relativeSD, numPartitions) )
694
702
}
695
703
696
-
697
704
/**
698
705
* Return approximate number of distinct values for each key in this RDD.
699
- * The accuracy of approximation can be controlled through the relative standard deviation
700
- * (relativeSD) parameter, which also controls the amount of memory used. Lower values result in
701
- * more accurate counts but increase the memory footprint and vise versa. HashPartitions the
702
- * output RDD into numPartitions.
703
706
*
707
+ * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice:
708
+ * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available
709
+ * <a href="http://dx.doi.org/10.1145/2452376.2452456">here</a>.
710
+ *
711
+ * @param relativeSD Relative accuracy. Smaller values create counters that require more space.
712
+ * It must be greater than 0.000017.
704
713
*/
705
- def countApproxDistinctByKey (relativeSD : Double , numPartitions : Int ): JavaRDD [( K , Long ) ] = {
706
- rdd.countApproxDistinctByKey(relativeSD, numPartitions )
714
+ def countApproxDistinctByKey (relativeSD : Double ): JavaPairRDD [ K , Long ] = {
715
+ fromRDD( rdd.countApproxDistinctByKey(relativeSD) )
707
716
}
708
717
709
718
/** Assign a name to this RDD */
0 commit comments