Closes #1130. No keys needed on merge.data.table. No more deep copies.

arunsrinivasan · arunsrinivasan · commit 44b1e006af81 · 2015-08-10T02:22:07.000+02:00
diff --git a/R/merge.R b/R/merge.R
@@ -2,98 +2,75 @@ merge.data.table <- function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FA
                              all.y = all, suffixes = c(".x", ".y"), allow.cartesian=getOption("datatable.allow.cartesian"), ...) {
     if (!inherits(y, 'data.table')) {
         y <- as.data.table(y)
-        if (missing(by)) {
+        if (missing(by) && missing(by.x)) {
             by <- key(x)
         }
     }
     if (any(duplicated(names(x)))) stop("x has some duplicated column name(s): ",paste(names(x)[duplicated(names(x))],collapse=","),". Please remove or rename the duplicate(s) and try again.")
     if (any(duplicated(names(y)))) stop("y has some duplicated column name(s): ",paste(names(y)[duplicated(names(y))],collapse=","),". Please remove or rename the duplicate(s) and try again.")
     
-    ## Determine by and rename columns of y, if by.x and by.y are supplied
-    if (!is.null(by.x)){
-      if (!is.null(by)) warning("Supplied both by and by.x and only by will be used")
-      else {
-        by <- by.x
-        if (length(by.x) != length(by.y)) stop("by.x and by.y must be of the same length")
-        setnames(y, by.y, by.x)
-        on.exit(setnames(y, by.x, by.y))
-      }
+    ## set up 'by'/'by.x'/'by.y'
+    if ( (!is.null(by.x) || !is.null(by.y)) && length(by.x)!=length(by.y) )
+        stop("`by.x` and `by.y` must be of same length.")
+    if (!missing(by) && !missing(by.x))
+        warning("Supplied both `by` and `by.x/by.y`. `by` argument will be ignored.")
+    if (!is.null(by.x)) {
+        if ( !is.character(by.x) || !is.character(by.y))
+            stop("A non-empty vector of column names are required for `by.x` and `by.y`.")
+        if (!all(by.x %in% names(x)))
+            stop("Elements listed in `by.x` must be valid column names in x.")
+        if (!all(by.y %in% names(y)))
+            stop("Elements listed in `by.y` must be valid column names in y.")
+        by = by.x
+        names(by) = by.y
+    } else {
+        if (is.null(by))
+            by = intersect(key(x), key(y))
+        if (is.null(by)) 
+            by = key(x)
+        if (is.null(by))
+            stop("Can not match keys in x and y to automatically determine appropriate `by` parameter. Please set `by` value explicitly.")
+        if (length(by) == 0L || !is.character(by))
+            stop("A non-empty vector of column names for `by` is required.")
+        if (!all(by %in% intersect(colnames(x), colnames(y))))
+            stop("Elements listed in `by` must be valid column names in x and y")
+        by.x = by.y = by
     }
-    
-    ## Try to infer proper value for `by`
-    if (is.null(by)) {
-        by <- intersect(key(x), key(y))
-    }
-    if (is.null(by)) {
-        by <- key(x)
-    }
-    if (is.null(by)) {
-        stop("Can not match keys in x and y to automatically determine ",
-             "appropriate `by` parameter. Please set `by` value explicitly.")
-    }
-    if (length(by) == 0L || !is.character(by)) {
-        stop("A non-empty vector of column names for `by` is required.")
-    }
-    if (!all(by %in% intersect(colnames(x), colnames(y)))) {
-        stop("Elements listed in `by` must be valid column names in x and y")
-    }
-
-    ## Checks to see that keys on dt are set and are in correct order
-    .reset.keys <- function(dt, by) {
-        dt.key <- key(dt)
-        length(dt.key) < length(by) || !all(dt.key[1:length(by)] == by)
-    }
-
-    if (.reset.keys(y, by)) {
-        y=setkeyv(copy(y),by)
-        # TO DO Add a secondary key here, when implemented which would be cached in the object
-    }
-
-    xkey = if (identical(key(x),by)) x else setkeyv(copy(x),by)
-    # TO DO: new [.data.table argument joincols or better name would allow leaving x as is if by was a head subset
-    # of key(x). Also NAMED on each column would allow subset references. Also, a secondary key may be
-    # much simpler but just need an argument to tell [.data.table to use the 2key of i.
-
     # with i. prefix in v1.9.3, this goes away. Left here for now ...
     ## sidestep the auto-increment column number feature-leading-to-bug by
     ## ensuring no names end in ".1", see unit test
     ## "merge and auto-increment columns in y[x]" in test-data.frame.like.R
-    dupnames <- setdiff(intersect(names(xkey), names(y)), by)
+    start = setdiff(names(x), by.x)
+    end = setdiff(names(y), by.y)
+    dupnames = intersect(start, end)
     if (length(dupnames)) {
-        xkey = setnames(shallow(xkey), dupnames, sprintf("%s.", dupnames))
-        y = setnames(shallow(y), dupnames, sprintf("%s.", dupnames))
+        start[start %in% dupnames] = paste(dupnames, suffixes[1L], sep="")
+        end[end %in% dupnames] = paste(dupnames, suffixes[2L], sep="")
     }
 
-    dt = y[xkey,nomatch=ifelse(all.x,NA,0),allow.cartesian=allow.cartesian]   # includes JIS columns (with a i. prefix if conflict with x names)
+    dt = y[x,nomatch=ifelse(all.x,NA,0),on=by,allow.cartesian=allow.cartesian]   # includes JIS columns (with a i. prefix if conflict with x names)
 
     if (all.y && nrow(y)) {  # If y does not have any rows, no need to proceed
         # Perhaps not very commonly used, so not a huge deal that the join is redone here.
         missingyidx = seq.int(nrow(y))
-        whichy = y[xkey,which=TRUE,nomatch=0,allow.cartesian=allow.cartesian]  # !!TO DO!!:  Use not join (i=-xkey) here now that's implemented
+        whichy = y[x,which=TRUE,nomatch=0,on=by,allow.cartesian=allow.cartesian]  # !!TO DO!!:  Use not join (i=-x) here now that's implemented
         whichy = whichy[whichy>0]
         if (length(whichy)) missingyidx = missingyidx[-whichy]
         if (length(missingyidx)) {
-            yy <- y[missingyidx]
-            othercolsx <- setdiff(names(xkey), by)
+            yy = y[missingyidx]
+            othercolsx = setdiff(names(x), by)
             if (length(othercolsx)) {
                 tmp = rep.int(NA_integer_, length(missingyidx))
-                yy <- cbind(yy, xkey[tmp, othercolsx, with = FALSE])
+                yy = cbind(yy, x[tmp, othercolsx, with = FALSE])
             }
             dt = rbind(dt, yy, use.names=FALSE) # empty data.tables (nrow =0, ncol>0) doesn't skip names anymore in new rbindlist
                                                 # takes care of #5672 without having to save names. This is how it should be, IMHO.
         }
     }
-    
-    end = setdiff(names(y),by)     # X[Y] sytax puts JIS i columns at the end, merge likes them alongside i.
-    setcolorder(dt,c(setdiff(names(dt),end),end))
-    
-    if (nrow(dt) > 0) setkeyv(dt,by)
-
-    if (length(dupnames)) {
-        setnames(dt, sprintf("%s.", dupnames), paste(dupnames, suffixes[2], sep=""))
-        setnames(dt, sprintf("i.%s.", dupnames), paste(dupnames, suffixes[1], sep=""))
-    }
-    
+    # X[Y] syntax puts JIS i columns at the end, merge likes them alongside i.
+    newend = setdiff(names(y), by.y)
+    setcolorder(dt, c(setdiff(names(dt), newend), newend))
+    setnames(dt, c(by.x, start, end))
+    if (nrow(dt) > 0) setkeyv(dt, by.x)
     dt
 }
-
diff --git a/README.md b/README.md
@@ -64,16 +64,16 @@
 
   21. `setDF` also converts `list` of equal length to `data.frame` by reference now. Closes [#1132](https://github.com/Rdatatable/data.table/issues/1132).
 
-  22. `merge.data.table` now has new arguments `by.x` and `by.y`. Closes [#637](https://github.com/Rdatatable/data.table/issues/637). Thanks to @NelloBlaser.
+  22. `CJ` gains logical `unique` argument with default `FALSE`. If `TRUE`, unique values of vectors are automatically computed and used. This is convenient, for example, `DT[CJ(a, b, c, unique=TRUE)]` instead of  doing `DT[CJ(unique(a), unique(b), unique(c))]`. Ultimately, `unique = TRUE` will be default. Closes [#1148](https://github.com/Rdatatable/data.table/issues/1148). 
 
-  23. `CJ` gains logical `unique` argument with default `FALSE`. If `TRUE`, unique values of vectors are automatically computed and used. This is convenient, for example, `DT[CJ(a, b, c, unique=TRUE)]` instead of  doing `DT[CJ(unique(a), unique(b), unique(c))]`. Ultimately, `unique = TRUE` will be default. Closes [#1148](https://github.com/Rdatatable/data.table/issues/1148). 
+  23. Implemented `stringsAsFactors` argument for `fread()`. When `TRUE`, character columns are converted to factors. Default is `FALSE`. Thanks to Artem Klevtsov for filing [#501](https://github.com/Rdatatable/data.table/issues/501), and to @hmi2015 for [this SO post](http://stackoverflow.com/q/31350209/559784).
 
-  24. Implemented `stringsAsFactors` argument for `fread()`. When `TRUE`, character columns are converted to factors. Default is `FALSE`. Thanks to Artem Klevtsov for filing [#501](https://github.com/Rdatatable/data.table/issues/501), and to @hmi2015 for [this SO post](http://stackoverflow.com/q/31350209/559784).
+  24. `fread` gains `check.names` argument, with default value `FALSE`. When `TRUE`, it uses the base function `make.unique()` to ensure that the column names of the data.table read in are all unique. Thanks to David Arenburg for filing [#1027](https://github.com/Rdatatable/data.table/issues/1027).
 
-  25. `fread` gains `check.names` argument, with default value `FALSE`. When `TRUE`, it uses the base function `make.unique()` to ensure that the column names of the data.table read in are all unique. Thanks to David Arenburg for filing [#1027](https://github.com/Rdatatable/data.table/issues/1027).
+  25. data.tables can join now without having to set keys by using the new `on` argument. For example: `DT1[DT2, on=c(x = "y")]` would join column 'y' of `DT2` with 'x' of `DT1`. `DT1[DT2, on="y"]` would join on column 'y' on both data.tables. Closes [#1130](https://github.com/Rdatatable/data.table/issues/1130) partly.
+
+  26. `merge.data.table` gains arguments `by.x` and `by.y`. Closes [#637](https://github.com/Rdatatable/data.table/issues/637) and [#1130](https://github.com/Rdatatable/data.table/issues/1130). No copies are made even when the specified columns aren't key columns in data.tables, and it is therefore much faster and more memory efficient. Thanks to @blasern for the initial PRs.
 
-  26. data.tables can join now without having to set keys by using the new `on` argument. For example: `DT1[DT2, on=c(x = "y")]` would join column 'y' of `DT2` with 'x' of `DT1`. `DT1[DT2, on="y"]` would join on column 'y' on both data.tables. Closes [#1130](https://github.com/Rdatatable/data.table/issues/1130) partly.
- 
 #### BUG FIXES
 
   1. `if (TRUE) DT[,LHS:=RHS]` no longer prints, [#869](https://github.com/Rdatatable/data.table/issues/869) and [#1122](https://github.com/Rdatatable/data.table/issues/1122). Tests added. To get this to work we've had to live with one downside: if a `:=` is used inside a function with no `DT[]` before the end of the function, then the next time `DT` or `print(DT)` is typed at the prompt, nothing will be printed. A repeated `DT` or `print(DT)` will print. To avoid this: include a `DT[]` after the last `:=` in your function. If that is not possible (e.g., it's not a function you can change) then `DT[]` at the prompt is guaranteed to print. As before, adding an extra `[]` on the end of a `:=` query is a recommended idiom to update and then print; e.g. `> DT[,foo:=3L][]`. Thanks to Jureiss and Jan Gorecki for reporting.
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -6665,6 +6665,29 @@ if (!inherits(try(Rprofmem(NULL), silent=TRUE), "try-error")) {  # in case R not
   unlink(f)
 }
 
+# Rest of #1130: merge() no longer copies; it joins on the `by` columns without needing keys. Each test compares against merge.data.frame.
+set.seed(1L)
+d1 <- data.table(A = sample(letters[1:10]), X = 1:10, total = TRUE)
+d2 <- data.table(A = sample(letters[5:14]), Y = 1:10, total = FALSE)
+
+ans1 <- suppressWarnings(merge(setDF(d1), setDF(d2), by="A"))
+ans2 <- setDF(merge(setDT(d1), setDT(d2), by="A"))
+test(1543.1, ans1, ans2)
+ans1 <- suppressWarnings(merge(setDF(d1), setDF(d2), all=TRUE, by="A"))
+ans2 <- setDF(merge(setDT(d1), setDT(d2), all=TRUE, by="A"))
+test(1543.2, ans1, ans2)
+# Duplicate-name cases: after the rename, d2's join column is "B" and its non-join column "A" clashes with d1's join column "A" ("total" is shared too).
+setnames(d2, c("A", "Y"), c("B", "A"))
+ans1 <- suppressWarnings(merge(setDF(d1), setDF(d2), by.x="A", by.y="B"))
+ans2 <- setDF(merge(setDT(d1), setDT(d2), by.x="A", by.y="B"))
+test(1543.3, ans1, ans2)
+ans1 <- suppressWarnings(merge(setDF(d2), setDF(d1), by.x="B", by.y="A"))
+ans2 <- setDF(merge(setDT(d2), setDT(d1), by.x="B", by.y="A"))
+test(1543.4, ans1, ans2)
+ans1 <- suppressWarnings(merge(setDF(d2), setDF(d1), all=TRUE, by.x="B", by.y="A"))
+ans2 <- setDF(merge(setDT(d2), setDT(d1), all=TRUE, by.x="B", by.y="A"))
+test(1543.5, ans1, ans2)
+
 ##########################
 
 
diff --git a/man/merge.Rd b/man/merge.Rd
@@ -3,12 +3,13 @@
 \alias{merge.data.table}
 \title{ Merge Two Data Tables }
 \description{
-  Relatively quick merge of two \code{data.table}s based on common key columns (by default).
+  Fast merge of two \code{data.table}s.
 
-  This \code{merge} method for \code{data.table} is meant to act very similarly to the
-  \code{merge} method for \code{data.frame}, with the major exception being that
-  the default columns used to merge two \code{data.table} inputs are the shared key columns
-  rather than the shared columns with the same names.
+  This \code{merge} method for \code{data.table} behaves very similarly to that
+  of \code{data.frame}s with one major exception: By default, 
+  the columns used to merge the \code{data.table}s are the shared key columns
+  rather than the shared columns with the same names. Set the \code{by}, or \code{by.x}, 
+  \code{by.y} arguments explicitly to override this default.
   
 }
 
@@ -71,23 +72,15 @@ allow.cartesian=getOption("datatable.allow.cartesian"),  # default FALSE
 \details{
   \code{\link{merge}} is a generic function in base R. It dispatches to either the
   \code{merge.data.frame} method or \code{merge.data.table} method depending on the class of its first argument.
-  Typing \code{?merge} at the prompt should present a choice of two links:
-  the help pages for each of these \code{merge} methods. You don't need to use the full name of the
-  method although you may if you wish; i.e., \code{merge(DT1,DT2)} is idiomatic R but you can bypass
-  method dispatch by going direct if you wish: \code{merge.data.table(DT1,DT2)}. 
-
-  Note that if the specified columns in \code{by} is not the key (or
-  head of the key) of \code{x} or \code{y}, then a copy is first rekeyed 
-  prior to performing the merge. This might make this function perform
-  slower than you are expecting. When secondary keys are implemented in
-  future we expect performance in this case to improve.
-
-  For a more \code{data.table}-centric (and faster) way of merging two \code{data.table}s,
-  see \code{\link{[.data.table}}; e.g., \code{x[y, ...]}. In recent
-  versions, however, \code{merge()} is much closer to the speed of \code{x[y, ...]}.
-  See FAQ 1.12 for a detailed comparison of \code{merge} and \code{x[y, ...]}.
-
-  Columns of numeric types (i.e., double) have their last two bytes rounded off while computing order, by defalult, to avoid any unexpected behaviour due to limitations in representing floating point numbers precisely. For large numbers (integers > 2^31), we recommend using \code{bit64::integer64}. Have a look at \code{\link{setNumericRounding}} to learn more.
+  
+  In versions \code{< v1.9.6}, if the columns specified in \code{by} were not the key (or head of the key) of \code{x} or \code{y}, then a \code{\link{copy}} was first rekeyed prior to performing the merge. This was slower and less memory efficient. 
+
+  In version \code{v1.9.4} secondary keys were implemented. In \code{v1.9.6}, the concept of secondary keys has been 
+  extended to \code{merge}. No deep copies are made anymore, so merges are much faster and more memory efficient. The newly implemented \code{by.x} and \code{by.y} arguments also give better control over the columns to merge on.
+
+  For a more \code{data.table}-centric way of merging two \code{data.table}s, see \code{\link{[.data.table}}; e.g., \code{x[y, ...]}. See FAQ 1.12 for a detailed comparison of \code{merge} and \code{x[y, ...]}.
+
+  Merges on numeric columns: Columns of numeric types (i.e., double) have their last two bytes rounded off while computing order, by default, to avoid any unexpected behaviour due to limitations in representing floating point numbers precisely. For large numbers (integers > 2^31), we recommend using \code{bit64::integer64}. Have a look at \code{\link{setNumericRounding}} to learn more.
   
 }
 
@@ -138,6 +131,18 @@ merge(d4, d1)
 merge(d1, d4, all=TRUE)
 merge(d4, d1, all=TRUE)
 
+# new feature, no need to set keys anymore
+set.seed(1L)
+d1 <- data.table(a=sample(rep(1:3,each=2)), z=1:6)
+d2 <- data.table(a=2:0, z=10:12)
+merge(d1, d2, by="a")
+merge(d1, d2, by="a", all=TRUE)
+
+# new feature, using by.x and by.y arguments
+setnames(d2, "a", "b")
+merge(d1, d2, by.x="a", by.y="b")
+merge(d1, d2, by.x="a", by.y="b", all=TRUE)
+merge(d2, d1, by.x="b", by.y="a")
 }
 
 \keyword{ data }