@@ -1560,3 +1560,48 @@ setMethod("zipRDD",
 
             PipelinedRDD(zippedRDD, partitionFunc)
           })
+
+#' Intersection of this RDD and another one.
+#'
+#' Return the intersection of this RDD and another one.
+#' The output will not contain any duplicate elements,
+#' even if the input RDDs did. Performs a hash partition
+#' across the cluster.
+#' Note that this method performs a shuffle internally.
+#'
+#' @param x An RDD.
+#' @param other An RDD.
+#' @param numPartitions The number of partitions in the result RDD.
+#' @return An RDD which is the intersection of these two RDDs.
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' rdd1 <- parallelize(sc, list(1, 10, 2, 3, 4, 5))
+#' rdd2 <- parallelize(sc, list(1, 6, 2, 3, 7, 8))
+#' collect(sortBy(intersection(rdd1, rdd2), function(x) { x }))
+#' # list(1, 2, 3)
+#'}
+#' @rdname intersection
+#' @aliases intersection,RDD
+setMethod("intersection",
+          signature(x = "RDD", other = "RDD"),
+          function(x, other, numPartitions = SparkR::numPartitions(x)) {
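+            # Key each element by itself, with NA as a dummy value, so that
+            # cogroup() can match elements across the two RDDs.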
+            rdd1 <- map(x, function(v) { list(v, NA) })
+            rdd2 <- map(other, function(v) { list(v, NA) })
+
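+            # After cogroup(), each element is (key, list(group1, group2)),
+            # where each group holds the dummy values contributed by one RDD.
+            # Keep a key only when both groups are non-empty.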
+            filterFunction <- function(elem) {
+              iters <- elem[[2]]
+              all(as.vector(
+                lapply(iters, function(iter) { length(iter) > 0 }), mode = "logical"))
+            }
+
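+            # cogroup() hash-partitions and shuffles both RDDs; filterRDD()
+            # drops keys missing from either side, and keys() strips the
+            # dummy values, yielding the deduplicated intersection.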
+            keys(filterRDD(cogroup(rdd1, rdd2, numPartitions = numPartitions), filterFunction))
+          })