Skip to content

Commit 14e39e4

Browse files
Use not-join in merge to improve performance on full joins.
1 parent 44b1e00 commit 14e39e4

File tree

1 file changed

+8
-9
lines changed

1 file changed

+8
-9
lines changed

R/merge.R

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
merge.data.table <- function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FALSE, all.x = all,
22
all.y = all, suffixes = c(".x", ".y"), allow.cartesian=getOption("datatable.allow.cartesian"), ...) {
3-
if (!inherits(y, 'data.table')) {
4-
y <- as.data.table(y)
3+
if (!is.data.table(y)) {
4+
y = as.data.table(y)
55
if (missing(by) && missing(by.x)) {
6-
by <- key(x)
6+
by = key(x)
77
}
88
}
99
if (any(duplicated(names(x)))) stop("x has some duplicated column name(s): ",paste(names(x)[duplicated(names(x))],collapse=","),". Please remove or rename the duplicate(s) and try again.")
@@ -52,19 +52,18 @@ merge.data.table <- function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FA
5252

5353
if (all.y && nrow(y)) { # If y does not have any rows, no need to proceed
5454
# Perhaps not very commonly used, so not a huge deal that the join is redone here.
55-
missingyidx = seq.int(nrow(y))
56-
whichy = y[x,which=TRUE,nomatch=0,on=by,allow.cartesian=allow.cartesian] # !!TO DO!!: Use not join (i=-x) here now that's implemented
57-
whichy = whichy[whichy>0]
58-
if (length(whichy)) missingyidx = missingyidx[-whichy]
55+
missingyidx = y[!x,which=TRUE,on=by,allow.cartesian=allow.cartesian]
5956
if (length(missingyidx)) {
6057
yy = y[missingyidx]
6158
othercolsx = setdiff(names(x), by)
6259
if (length(othercolsx)) {
6360
tmp = rep.int(NA_integer_, length(missingyidx))
61+
# TO DO: use set() here instead..
6462
yy = cbind(yy, x[tmp, othercolsx, with = FALSE])
6563
}
66-
dt = rbind(dt, yy, use.names=FALSE) # empty data.tables (nrow =0, ncol>0) doesn't skip names anymore in new rbindlist
67-
# takes care of #5672 without having to save names. This is how it should be, IMHO.
64+
# empty data.tables (nrow =0, ncol>0) don't skip names anymore in new rbindlist
65+
# takes care of #5672 without having to save names. This is how it should be, IMHO.
66+
dt = rbind(dt, yy, use.names=FALSE)
6867
}
6968
}
7069
# X[Y] syntax puts JIS i columns at the end, merge likes them alongside i.

0 commit comments

Comments
 (0)