Skip to content

Commit 21e9b74

Browse files
Merge pull request apache#145 from lythesia/master
[SPARKR-175] Refactor join code
2 parents 76f6b9e + 1c2dbec commit 21e9b74

File tree

2 files changed

+66
-128
lines changed

2 files changed

+66
-128
lines changed

pkg/R/RDD.R

Lines changed: 8 additions & 128 deletions
Original file line numberDiff line numberDiff line change
@@ -314,7 +314,7 @@ setMethod("checkpoint",
314314
#'\dontrun{
315315
#' sc <- sparkR.init()
316316
#' rdd <- parallelize(sc, 1:10, 2L)
317-
#' numParititions(rdd) # 2L
317+
#' numPartitions(rdd) # 2L
318318
#'}
319319
setGeneric("numPartitions", function(rdd) { standardGeneric("numPartitions") })
320320

@@ -1603,32 +1603,7 @@ setMethod("join",
16031603
rdd2Tagged <- lapply(rdd2, function(x) { list(x[[1]], list(2L, x[[2]])) })
16041604

16051605
doJoin <- function(v) {
1606-
t1 <- vector("list", length(v))
1607-
t2 <- vector("list", length(v))
1608-
index1 <- 1
1609-
index2 <- 1
1610-
for (x in v) {
1611-
if (x[[1]] == 1L) {
1612-
t1[[index1]] <- x[[2]]
1613-
index1 <- index1 + 1
1614-
} else {
1615-
t2[[index2]] <- x[[2]]
1616-
index2 <- index2 + 1
1617-
}
1618-
}
1619-
length(t1) <- index1 - 1
1620-
length(t2) <- index2 - 1
1621-
1622-
result <- list()
1623-
length(result) <- length(t1) * length(t2)
1624-
index <- 1
1625-
for (i in t1) {
1626-
for (j in t2) {
1627-
result[[index]] <- list(i, j)
1628-
index <- index + 1
1629-
}
1630-
}
1631-
result
1606+
joinTaggedList(v, list(FALSE, FALSE))
16321607
}
16331608

16341609
joined <- flatMapValues(groupByKey(unionRDD(rdd1Tagged, rdd2Tagged), numPartitions), doJoin)
@@ -1668,37 +1643,7 @@ setMethod("leftOuterJoin",
16681643
rdd2Tagged <- lapply(rdd2, function(x) { list(x[[1]], list(2L, x[[2]])) })
16691644

16701645
doJoin <- function(v) {
1671-
t1 <- vector("list", length(v))
1672-
t2 <- vector("list", length(v))
1673-
index1 <- 1
1674-
index2 <- 1
1675-
for (x in v) {
1676-
if (x[[1]] == 1L) {
1677-
t1[[index1]] <- x[[2]]
1678-
index1 <- index1 + 1
1679-
} else {
1680-
t2[[index2]] <- x[[2]]
1681-
index2 <- index2 + 1
1682-
}
1683-
}
1684-
length(t1) <- index1 - 1
1685-
len2 <- index2 - 1
1686-
if (len2 == 0) {
1687-
t2 <- list(NULL)
1688-
} else {
1689-
length(t2) <- len2
1690-
}
1691-
1692-
result <- list()
1693-
length(result) <- length(t1) * length(t2)
1694-
index <- 1
1695-
for (i in t1) {
1696-
for (j in t2) {
1697-
result[[index]] <- list(i, j)
1698-
index <- index + 1
1699-
}
1700-
}
1701-
result
1646+
joinTaggedList(v, list(FALSE, TRUE))
17021647
}
17031648

17041649
joined <- flatMapValues(groupByKey(unionRDD(rdd1Tagged, rdd2Tagged), numPartitions), doJoin)
@@ -1738,37 +1683,7 @@ setMethod("rightOuterJoin",
17381683
rdd2Tagged <- lapply(rdd2, function(x) { list(x[[1]], list(2L, x[[2]])) })
17391684

17401685
doJoin <- function(v) {
1741-
t1 <- vector("list", length(v))
1742-
t2 <- vector("list", length(v))
1743-
index1 <- 1
1744-
index2 <- 1
1745-
for (x in v) {
1746-
if (x[[1]] == 1L) {
1747-
t1[[index1]] <- x[[2]]
1748-
index1 <- index1 + 1
1749-
} else {
1750-
t2[[index2]] <- x[[2]]
1751-
index2 <- index2 + 1
1752-
}
1753-
}
1754-
len1 <- index1 - 1
1755-
if (len1 == 0) {
1756-
t1 <- list(NULL)
1757-
} else {
1758-
length(t1) <- len1
1759-
}
1760-
length(t2) <- index2 - 1
1761-
1762-
result <- list()
1763-
length(result) <- length(t1) * length(t2)
1764-
index <- 1
1765-
for (i in t1) {
1766-
for (j in t2) {
1767-
result[[index]] <- list(i, j)
1768-
index <- index + 1
1769-
}
1770-
}
1771-
result
1686+
joinTaggedList(v, list(TRUE, FALSE))
17721687
}
17731688

17741689
joined <- flatMapValues(groupByKey(unionRDD(rdd1Tagged, rdd2Tagged), numPartitions), doJoin)
@@ -1798,59 +1713,24 @@ setMethod("rightOuterJoin",
17981713
#' rdd2 <- parallelize(sc, list(list(1, 1), list(2, 4)))
17991714
#' fullOuterJoin(rdd1, rdd2, 2L) # list(list(1, list(2, 1)),
18001715
#' # list(1, list(3, 1)),
1801-
#' # list(3, list(3, NULL)),
18021716
#' # list(2, list(NULL, 4)),
1717+
#' # list(3, list(3, NULL)))
18031718
#'}
18041719
setGeneric("fullOuterJoin", function(rdd1, rdd2, numPartitions) { standardGeneric("fullOuterJoin") })
18051720

18061721
#' @rdname fullOuterJoin
18071722
#' @aliases fullOuterJoin,RDD,RDD-method
1723+
18081724
setMethod("fullOuterJoin",
18091725
signature(rdd1 = "RDD", rdd2 = "RDD", numPartitions = "integer"),
18101726
function(rdd1, rdd2, numPartitions) {
18111727
rdd1Tagged <- lapply(rdd1, function(x) { list(x[[1]], list(1L, x[[2]])) })
18121728
rdd2Tagged <- lapply(rdd2, function(x) { list(x[[1]], list(2L, x[[2]])) })
18131729

18141730
doJoin <- function(v) {
1815-
t1 <- vector("list", length(v))
1816-
t2 <- vector("list", length(v))
1817-
index1 <- 1
1818-
index2 <- 1
1819-
for (x in v) {
1820-
if (x[[1]] == 1L) {
1821-
t1[[index1]] <- x[[2]]
1822-
index1 <- index1 + 1
1823-
} else {
1824-
t2[[index2]] <- x[[2]]
1825-
index2 <- index2 + 1
1826-
}
1827-
}
1828-
len1 <- index1 - 1
1829-
len2 <- index2 - 1
1830-
1831-
if (len1 == 0) {
1832-
t1 <- list(NULL)
1833-
} else {
1834-
length(t1) <- len1
1835-
}
1836-
1837-
if (len2 == 0) {
1838-
t2 <- list(NULL)
1839-
} else {
1840-
length(t2) <- len2
1841-
}
1842-
1843-
result <- list()
1844-
length(result) <- length(t1) * length(t2)
1845-
index <- 1
1846-
for(i in t1) {
1847-
for(j in t2) {
1848-
result[[index]] <- list(i, j)
1849-
index <- index + 1
1850-
}
1851-
}
1852-
result
1731+
joinTaggedList(v, list(TRUE, TRUE))
18531732
}
1733+
18541734
joined <- flatMapValues(groupByKey(unionRDD(rdd1Tagged, rdd2Tagged), numPartitions), doJoin)
18551735
})
18561736

pkg/R/utils.R

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,3 +201,61 @@ sortKeyValueList <- function(kv_list) {
201201
keys <- sapply(kv_list, function(x) x[[1]])
202202
kv_list[order(keys)]
203203
}
204+
205+
# Utility function to generate compact R lists from grouped rdd
206+
# Used in Join-family functions
207+
# param:
208+
# tagged_list R list generated via groupByKey with tags(1L, 2L, ...)
209+
# cnull Boolean list where each element determines whether the corresponding list should
210+
# be converted to list(NULL) when it is empty
211+
genCompactLists <- function(tagged_list, cnull) {
212+
len <- length(tagged_list)
213+
lists <- list(vector("list", len), vector("list", len))
214+
index <- list(1, 1)
215+
216+
for (x in tagged_list) {
217+
tag <- x[[1]]
218+
idx <- index[[tag]]
219+
lists[[tag]][[idx]] <- x[[2]]
220+
index[[tag]] <- idx + 1
221+
}
222+
223+
len <- lapply(index, function(x) x - 1)
224+
for (i in (1:2)) {
225+
if (cnull[[i]] && len[[i]] == 0) {
226+
lists[[i]] <- list(NULL)
227+
} else {
228+
length(lists[[i]]) <- len[[i]]
229+
}
230+
}
231+
232+
lists
233+
}
234+
235+
# Utility function to merge compact R lists
236+
# Used in Join-family functions
237+
# param:
238+
# left/right Two compact lists ready for Cartesian product
239+
mergeCompactLists <- function(left, right) {
240+
result <- list()
241+
length(result) <- length(left) * length(right)
242+
index <- 1
243+
for (i in left) {
244+
for (j in right) {
245+
result[[index]] <- list(i, j)
246+
index <- index + 1
247+
}
248+
}
249+
result
250+
}
251+
252+
# Utility function that wraps the above two operations
253+
# Used in Join-family functions
254+
# param (same as genCompactLists):
255+
# tagged_list R list generated via groupByKey with tags(1L, 2L, ...)
256+
# cnull Boolean list where each element determines whether the corresponding list should
257+
# be converted to list(NULL) when it is empty
258+
joinTaggedList <- function(tagged_list, cnull) {
259+
lists <- genCompactLists(tagged_list, cnull)
260+
mergeCompactLists(lists[[1]], lists[[2]])
261+
}

0 commit comments

Comments
 (0)