Skip to content

Commit 44b1e00

Browse files
Closes #1130. No keys needed on merge.data.table. No more deep copies.
1 parent 8bf3de4 commit 44b1e00

File tree

4 files changed

+98
-93
lines changed

4 files changed

+98
-93
lines changed

R/merge.R

Lines changed: 42 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -2,98 +2,75 @@ merge.data.table <- function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FA
22
all.y = all, suffixes = c(".x", ".y"), allow.cartesian=getOption("datatable.allow.cartesian"), ...) {
33
if (!inherits(y, 'data.table')) {
44
y <- as.data.table(y)
5-
if (missing(by)) {
5+
if (missing(by) && missing(by.x)) {
66
by <- key(x)
77
}
88
}
99
if (any(duplicated(names(x)))) stop("x has some duplicated column name(s): ",paste(names(x)[duplicated(names(x))],collapse=","),". Please remove or rename the duplicate(s) and try again.")
1010
if (any(duplicated(names(y)))) stop("y has some duplicated column name(s): ",paste(names(y)[duplicated(names(y))],collapse=","),". Please remove or rename the duplicate(s) and try again.")
1111

12-
## Determine by and rename columns of y, if by.x and by.y are supplied
13-
if (!is.null(by.x)){
14-
if (!is.null(by)) warning("Supplied both by and by.x and only by will be used")
15-
else {
16-
by <- by.x
17-
if (length(by.x) != length(by.y)) stop("by.x and by.y must be of the same length")
18-
setnames(y, by.y, by.x)
19-
on.exit(setnames(y, by.x, by.y))
20-
}
12+
## set up 'by'/'by.x'/'by.y'
13+
if ( (!is.null(by.x) || !is.null(by.y)) && length(by.x)!=length(by.y) )
14+
stop("`by.x` and `by.y` must be of same length.")
15+
if (!missing(by) && !missing(by.x))
16+
warning("Supplied both `by` and `by.x/by.y`. `by` argument will be ignored.")
17+
if (!is.null(by.x)) {
18+
if ( !is.character(by.x) || !is.character(by.y))
19+
stop("A non-empty vector of column names are required for `by.x` and `by.y`.")
20+
if (!all(by.x %in% names(x)))
21+
stop("Elements listed in `by.x` must be valid column names in x.")
22+
if (!all(by.y %in% names(y)))
23+
stop("Elements listed in `by.y` must be valid column names in y.")
24+
by = by.x
25+
names(by) = by.y
26+
} else {
27+
if (is.null(by))
28+
by = intersect(key(x), key(y))
29+
if (is.null(by))
30+
by = key(x)
31+
if (is.null(by))
32+
stop("Can not match keys in x and y to automatically determine appropriate `by` parameter. Please set `by` value explicitly.")
33+
if (length(by) == 0L || !is.character(by))
34+
stop("A non-empty vector of column names for `by` is required.")
35+
if (!all(by %in% intersect(colnames(x), colnames(y))))
36+
stop("Elements listed in `by` must be valid column names in x and y")
37+
by.x = by.y = by
2138
}
22-
23-
## Try to infer proper value for `by`
24-
if (is.null(by)) {
25-
by <- intersect(key(x), key(y))
26-
}
27-
if (is.null(by)) {
28-
by <- key(x)
29-
}
30-
if (is.null(by)) {
31-
stop("Can not match keys in x and y to automatically determine ",
32-
"appropriate `by` parameter. Please set `by` value explicitly.")
33-
}
34-
if (length(by) == 0L || !is.character(by)) {
35-
stop("A non-empty vector of column names for `by` is required.")
36-
}
37-
if (!all(by %in% intersect(colnames(x), colnames(y)))) {
38-
stop("Elements listed in `by` must be valid column names in x and y")
39-
}
40-
41-
## Checks to see that keys on dt are set and are in correct order
42-
.reset.keys <- function(dt, by) {
43-
dt.key <- key(dt)
44-
length(dt.key) < length(by) || !all(dt.key[1:length(by)] == by)
45-
}
46-
47-
if (.reset.keys(y, by)) {
48-
y=setkeyv(copy(y),by)
49-
# TO DO Add a secondary key here, when implemented which would be cached in the object
50-
}
51-
52-
xkey = if (identical(key(x),by)) x else setkeyv(copy(x),by)
53-
# TO DO: new [.data.table argument joincols or better name would allow leaving x as is if by was a head subset
54-
# of key(x). Also NAMED on each column would allow subset references. Also, a secondary key may be
55-
# much simpler but just need an argument to tell [.data.table to use the 2key of i.
56-
5739
# with i. prefix in v1.9.3, this goes away. Left here for now ...
5840
## sidestep the auto-increment column number feature-leading-to-bug by
5941
## ensuring no names end in ".1", see unit test
6042
## "merge and auto-increment columns in y[x]" in test-data.frame.like.R
61-
dupnames <- setdiff(intersect(names(xkey), names(y)), by)
43+
start = setdiff(names(x), by.x)
44+
end = setdiff(names(y), by.y)
45+
dupnames = intersect(start, end)
6246
if (length(dupnames)) {
63-
xkey = setnames(shallow(xkey), dupnames, sprintf("%s.", dupnames))
64-
y = setnames(shallow(y), dupnames, sprintf("%s.", dupnames))
47+
start[start %in% dupnames] = paste(dupnames, suffixes[1L], sep="")
48+
end[end %in% dupnames] = paste(dupnames, suffixes[2L], sep="")
6549
}
6650

67-
dt = y[xkey,nomatch=ifelse(all.x,NA,0),allow.cartesian=allow.cartesian] # includes JIS columns (with a i. prefix if conflict with x names)
51+
dt = y[x,nomatch=ifelse(all.x,NA,0),on=by,allow.cartesian=allow.cartesian] # includes JIS columns (with a i. prefix if conflict with x names)
6852

6953
if (all.y && nrow(y)) { # If y does not have any rows, no need to proceed
7054
# Perhaps not very commonly used, so not a huge deal that the join is redone here.
7155
missingyidx = seq.int(nrow(y))
72-
whichy = y[xkey,which=TRUE,nomatch=0,allow.cartesian=allow.cartesian] # !!TO DO!!: Use not join (i=-xkey) here now that's implemented
56+
whichy = y[x,which=TRUE,nomatch=0,on=by,allow.cartesian=allow.cartesian] # !!TO DO!!: Use not join (i=-x) here now that's implemented
7357
whichy = whichy[whichy>0]
7458
if (length(whichy)) missingyidx = missingyidx[-whichy]
7559
if (length(missingyidx)) {
76-
yy <- y[missingyidx]
77-
othercolsx <- setdiff(names(xkey), by)
60+
yy = y[missingyidx]
61+
othercolsx = setdiff(names(x), by)
7862
if (length(othercolsx)) {
7963
tmp = rep.int(NA_integer_, length(missingyidx))
80-
yy <- cbind(yy, xkey[tmp, othercolsx, with = FALSE])
64+
yy = cbind(yy, x[tmp, othercolsx, with = FALSE])
8165
}
8266
dt = rbind(dt, yy, use.names=FALSE) # empty data.tables (nrow =0, ncol>0) doesn't skip names anymore in new rbindlist
8367
# takes care of #5672 without having to save names. This is how it should be, IMHO.
8468
}
8569
}
86-
87-
end = setdiff(names(y),by) # X[Y] sytax puts JIS i columns at the end, merge likes them alongside i.
88-
setcolorder(dt,c(setdiff(names(dt),end),end))
89-
90-
if (nrow(dt) > 0) setkeyv(dt,by)
91-
92-
if (length(dupnames)) {
93-
setnames(dt, sprintf("%s.", dupnames), paste(dupnames, suffixes[2], sep=""))
94-
setnames(dt, sprintf("i.%s.", dupnames), paste(dupnames, suffixes[1], sep=""))
95-
}
96-
70+
# X[Y] syntax puts JIS i columns at the end, merge likes them alongside i.
71+
newend = setdiff(names(y), by.y)
72+
setcolorder(dt, c(setdiff(names(dt), newend), newend))
73+
setnames(dt, c(by.x, start, end))
74+
if (nrow(dt) > 0) setkeyv(dt, by.x)
9775
dt
9876
}
99-

README.md

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -64,16 +64,16 @@
6464

6565
21. `setDF` also converts `list` of equal length to `data.frame` by reference now. Closes [#1132](https://github.com/Rdatatable/data.table/issues/1132).
6666

67-
22. `merge.data.table` now has new arguments `by.x` and `by.y`. Closes [#637](https://github.com/Rdatatable/data.table/issues/637). Thanks to @NelloBlaser.
67+
22. `CJ` gains logical `unique` argument with default `FALSE`. If `TRUE`, unique values of vectors are automatically computed and used. This is convenient, for example, `DT[CJ(a, b, c, unique=TRUE)]` instead of doing `DT[CJ(unique(a), unique(b), unique(c))]`. Ultimately, `unique = TRUE` will be default. Closes [#1148](https://github.com/Rdatatable/data.table/issues/1148).
6868

69-
23. `CJ` gains logical `unique` argument with default `FALSE`. If `TRUE`, unique values of vectors are automatically computed and used. This is convenient, for example, `DT[CJ(a, b, c, unique=TRUE)]` instead of doing `DT[CJ(unique(a), unique(b), unique(c))]`. Ultimately, `unique = TRUE` will be default. Closes [#1148](https://github.com/Rdatatable/data.table/issues/1148).
69+
23. Implemented `stringsAsFactors` argument for `fread()`. When `TRUE`, character columns are converted to factors. Default is `FALSE`. Thanks to Artem Klevtsov for filing [#501](https://github.com/Rdatatable/data.table/issues/501), and to @hmi2015 for [this SO post](http://stackoverflow.com/q/31350209/559784).
7070

71-
24. Implemented `stringsAsFactors` argument for `fread()`. When `TRUE`, character columns are converted to factors. Default is `FALSE`. Thanks to Artem Klevtsov for filing [#501](https://github.com/Rdatatable/data.table/issues/501), and to @hmi2015 for [this SO post](http://stackoverflow.com/q/31350209/559784).
71+
24. `fread` gains `check.names` argument, with default value `FALSE`. When `TRUE`, it uses the base function `make.unique()` to ensure that the column names of the data.table read in are all unique. Thanks to David Arenburg for filing [#1027](https://github.com/Rdatatable/data.table/issues/1027).
7272

73-
25. `fread` gains `check.names` argument, with default value `FALSE`. When `TRUE`, it uses the base function `make.unique()` to ensure that the column names of the data.table read in are all unique. Thanks to David Arenburg for filing [#1027](https://github.com/Rdatatable/data.table/issues/1027).
73+
25. data.tables can join now without having to set keys by using the new `on` argument. For example: `DT1[DT2, on=c(x = "y")]` would join column 'y' of `DT2` with 'x' of `DT1`. `DT1[DT2, on="y"]` would join on column 'y' on both data.tables. Closes [#1130](https://github.com/Rdatatable/data.table/issues/1130) partly.
74+
75+
26. `merge.data.table` gains arguments `by.x` and `by.y`. Closes [#637](https://github.com/Rdatatable/data.table/issues/637) and [#1130](https://github.com/Rdatatable/data.table/issues/1130). No copies are made even when the specified columns aren't key columns in data.tables, and it is therefore much faster and more memory efficient. Thanks to @blasern for the initial PRs.
7476

75-
26. data.tables can join now without having to set keys by using the new `on` argument. For example: `DT1[DT2, on=c(x = "y")]` would join column 'y' of `DT2` with 'x' of `DT1`. `DT1[DT2, on="y"]` would join on column 'y' on both data.tables. Closes [#1130](https://github.com/Rdatatable/data.table/issues/1130) partly.
76-
7777
#### BUG FIXES
7878

7979
1. `if (TRUE) DT[,LHS:=RHS]` no longer prints, [#869](https://github.com/Rdatatable/data.table/issues/869) and [#1122](https://github.com/Rdatatable/data.table/issues/1122). Tests added. To get this to work we've had to live with one downside: if a `:=` is used inside a function with no `DT[]` before the end of the function, then the next time `DT` or `print(DT)` is typed at the prompt, nothing will be printed. A repeated `DT` or `print(DT)` will print. To avoid this: include a `DT[]` after the last `:=` in your function. If that is not possible (e.g., it's not a function you can change) then `DT[]` at the prompt is guaranteed to print. As before, adding an extra `[]` on the end of a `:=` query is a recommended idiom to update and then print; e.g. `> DT[,foo:=3L][]`. Thanks to Jureiss and Jan Gorecki for reporting.

inst/tests/tests.Rraw

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6665,6 +6665,29 @@ if (!inherits(try(Rprofmem(NULL), silent=TRUE), "try-error")) { # in case R not
66656665
unlink(f)
66666666
}
66676667

6668+
# rest of #1130 - merge doesn't copy, instead uses joins without keys.
6669+
set.seed(1L)
6670+
d1 <- data.table(A = sample(letters[1:10]), X = 1:10, total = TRUE)
6671+
d2 <- data.table(A = sample(letters[5:14]), Y = 1:10, total = FALSE)
6672+
6673+
ans1 <- suppressWarnings(merge(setDF(d1), setDF(d2), by="A"))
6674+
ans2 <- setDF(merge(setDT(d1), setDT(d2), by="A"))
6675+
test(1543.1, ans1, ans2)
6676+
ans1 <- suppressWarnings(merge(setDF(d1), setDF(d2), all=TRUE, by="A"))
6677+
ans2 <- setDF(merge(setDT(d1), setDT(d2), all=TRUE, by="A"))
6678+
test(1542.2, ans1, ans2)
6679+
# test duplicate name cases
6680+
setnames(d2, c("A", "Y"), c("B", "A"))
6681+
ans1 <- suppressWarnings(merge(setDF(d1), setDF(d2), by.x="A", by.y="B"))
6682+
ans2 <- setDF(merge(setDT(d1), setDT(d2), by.x="A", by.y="B"))
6683+
test(1543.3, ans1, ans2)
6684+
ans1 <- suppressWarnings(merge(setDF(d2), setDF(d1), by.x="B", by.y="A"))
6685+
ans2 <- setDF(merge(setDT(d2), setDT(d1), by.x="B", by.y="A"))
6686+
test(1543.4, ans1, ans2)
6687+
ans1 <- suppressWarnings(merge(setDF(d2), setDF(d1), all=TRUE, by.x="B", by.y="A"))
6688+
ans2 <- setDF(merge(setDT(d2), setDT(d1), all=TRUE, by.x="B", by.y="A"))
6689+
test(1543.5, ans1, ans2)
6690+
66686691
##########################
66696692

66706693

man/merge.Rd

Lines changed: 27 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,13 @@
33
\alias{merge.data.table}
44
\title{ Merge Two Data Tables }
55
\description{
6-
Relatively quick merge of two \code{data.table}s based on common key columns (by default).
6+
Fast merge of two \code{data.table}s.
77

8-
This \code{merge} method for \code{data.table} is meant to act very similarly to the
9-
\code{merge} method for \code{data.frame}, with the major exception being that
10-
the default columns used to merge two \code{data.table} inputs are the shared key columns
11-
rather than the shared columns with the same names.
8+
This \code{merge} method for \code{data.table} behaves very similarly to that
9+
of \code{data.frame}s with one major exception: By default,
10+
the columns used to merge the \code{data.table}s are the shared key columns
11+
rather than the shared columns with the same names. Set the \code{by}, or \code{by.x},
12+
\code{by.y} arguments explicitly to override this default.
1213

1314
}
1415

@@ -71,23 +72,15 @@ allow.cartesian=getOption("datatable.allow.cartesian"), # default FALSE
7172
\details{
7273
\code{\link{merge}} is a generic function in base R. It dispatches to either the
7374
\code{merge.data.frame} method or \code{merge.data.table} method depending on the class of its first argument.
74-
Typing \code{?merge} at the prompt should present a choice of two links:
75-
the help pages for each of these \code{merge} methods. You don't need to use the full name of the
76-
method although you may if you wish; i.e., \code{merge(DT1,DT2)} is idiomatic R but you can bypass
77-
method dispatch by going direct if you wish: \code{merge.data.table(DT1,DT2)}.
78-
79-
Note that if the specified columns in \code{by} is not the key (or
80-
head of the key) of \code{x} or \code{y}, then a copy is first rekeyed
81-
prior to performing the merge. This might make this function perform
82-
slower than you are expecting. When secondary keys are implemented in
83-
future we expect performance in this case to improve.
84-
85-
For a more \code{data.table}-centric (and faster) way of merging two \code{data.table}s,
86-
see \code{\link{[.data.table}}; e.g., \code{x[y, ...]}. In recent
87-
versions, however, \code{merge()} is much closer to the speed of \code{x[y, ...]}.
88-
See FAQ 1.12 for a detailed comparison of \code{merge} and \code{x[y, ...]}.
89-
90-
Columns of numeric types (i.e., double) have their last two bytes rounded off while computing order, by defalult, to avoid any unexpected behaviour due to limitations in representing floating point numbers precisely. For large numbers (integers > 2^31), we recommend using \code{bit64::integer64}. Have a look at \code{\link{setNumericRounding}} to learn more.
75+
76+
In versions \code{< v1.9.6}, if the columns specified in \code{by} were not the key (or head of the key) of \code{x} or \code{y}, then a \code{\link{copy}} was first rekeyed prior to performing the merge. This was slower and less memory efficient.
77+
78+
In version \code{v1.9.4} secondary keys were implemented. In \code{v1.9.6}, the concept of secondary keys has been
79+
extended to \code{merge}. No deep copies are made anymore, and merging is therefore much faster and more memory efficient. There is also better control over which columns to merge on, through the newly implemented \code{by.x} and \code{by.y} arguments.
80+
81+
For a more \code{data.table}-centric way of merging two \code{data.table}s, see \code{\link{[.data.table}}; e.g., \code{x[y, ...]}. See FAQ 1.12 for a detailed comparison of \code{merge} and \code{x[y, ...]}.
82+
83+
Merges on numeric columns: Columns of numeric types (i.e., double) have their last two bytes rounded off while computing order, by default, to avoid any unexpected behaviour due to limitations in representing floating point numbers precisely. For large numbers (integers > 2^31), we recommend using \code{bit64::integer64}. Have a look at \code{\link{setNumericRounding}} to learn more.
9184
9285
}
9386
@@ -138,6 +131,18 @@ merge(d4, d1)
138131
merge(d1, d4, all=TRUE)
139132
merge(d4, d1, all=TRUE)
140133
134+
# new feature, no need to set keys anymore
135+
set.seed(1L)
136+
d1 <- data.table(a=sample(rep(1:3,each=2)), z=1:6)
137+
d2 <- data.table(a=2:0, z=10:12)
138+
merge(d1, d2, by="a")
139+
merge(d1, d2, by="a", all=TRUE)
140+
141+
# new feature, using by.x and by.y arguments
142+
setnames(d2, "a", "b")
143+
merge(d1, d2, by.x="a", by.y="b")
144+
merge(d1, d2, by.x="a", by.y="b", all=TRUE)
145+
merge(d2, d1, by.x="b", by.y="a")
141146
}
142147
143148
\keyword{ data }

0 commit comments

Comments
 (0)