Rdatatable · MichaelChirico · Jan 27, 2025 · Jan 8, 2025 · Jan 8, 2025 · Jan 14, 2025
@@ -11,8 +11,8 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
       by = key(x)
     }
   }
-  x0 = length(x)==0L
-  y0 = length(y)==0L
+  x0 = length(x) == 0L
+  y0 = length(y) == 0L
   if (x0 || y0) {
     if (x0 && y0)
       warningf("Neither of the input data.tables to join have columns.")
@@ -28,17 +28,19 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
   nm_y = names(y)
 
   ## set up 'by'/'by.x'/'by.y'
-  if ( (!is.null(by.x) || !is.null(by.y)) && length(by.x)!=length(by.y) )
+  if ((!is.null(by.x) || !is.null(by.y)) && length(by.x) != length(by.y))
     stopf("`by.x` and `by.y` must be of same length.")
   if (!missing(by) && !missing(by.x))
-    warningf("Supplied both `by` and `by.x/by.y`. `by` argument will be ignored.")
+    warningf("Supplied both `by` and `by.x`/`by.y`. `by` argument will be ignored.")
   if (!is.null(by.x)) {
-    if (length(by.x)==0L || !is.character(by.x) || !is.character(by.y))
+    if (length(by.x) == 0L || !is.character(by.x) || !is.character(by.y))
       stopf("A non-empty vector of column names is required for `by.x` and `by.y`.")
-    if (!all(by.x %chin% nm_x))
-      stopf("Elements listed in `by.x` must be valid column names in x.")
-    if (!all(by.y %chin% nm_y))
-      stopf("Elements listed in `by.y` must be valid column names in y.")
+    if (!all(idx <- by.x %chin% nm_x)) {
+      stopf("The following columns listed in `%s` are missing from %s: %s", "by.x", "x", brackify(by.x[!idx]))
+    }
+    if (!all(idx <- by.y %chin% nm_y)) {
+      stopf("The following columns listed in `%s` are missing from %s: %s", "by.y", "y", brackify(by.y[!idx]))
+    }
     by = by.x
     names(by) = by.y
   } else {
@@ -50,8 +52,12 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
       by = intersect(nm_x, nm_y)
     if (length(by) == 0L || !is.character(by))
       stopf("A non-empty vector of column names for `by` is required.")
-    if (!all(by %chin% intersect(nm_x, nm_y)))
-      stopf("Elements listed in `by` must be valid column names in x and y")
+    if (!all(idx <- by %chin% nm_x)) { # `by` is validated as character just above, so %chin% is safe; matches the by.x/by.y checks
+      stopf("The following columns listed in `%s` are missing from %s: %s", "by", "x", brackify(by[!idx]))
+    }
+    if (!all(idx <- by %chin% nm_y)) { # x and y are checked separately so the message names the table lacking the column(s)
+      stopf("The following columns listed in `%s` are missing from %s: %s", "by", "y", brackify(by[!idx]))
+    }
     by = unname(by)
     by.x = by.y = by
   }

@@ -8567,14 +8567,13 @@ DT1 = data.table(a=1)
 test(1601.1, merge(DT1, DT1, by="a"), data.table(a=1, key="a"))
 test(1601.2, merge(DT1, DT0, by="a"),
      warning="Input data.table 'y' has no columns.",
-     error="Elements listed in `by`")
+     error="The following columns listed in `by` are missing from y: [a]")
 test(1601.3, merge(DT0, DT1, by="a"),
      warning="Input data.table 'x' has no columns.",
-     error="Elements listed in `by`")
+     error="The following columns listed in `by` are missing from x: [a]")
 test(1601.4, merge(DT0, DT0, by="a"),
      warning="Neither of the input data.tables to join have columns.",
-     error="Elements listed in `by`")
-
+     error="The following columns listed in `by` are missing from x: [a]")
 # fix for #1549
 d1 <- data.table(v1=1:2,x=x)
 d2 <- data.table(v1=3:4)
@@ -13546,14 +13545,14 @@ test(1962.016, merge(DT1, DT2, by.x = 'a', by.y = c('a', 'V')),
 test(1962.017, merge(DT1, DT2, by = 'V', by.x = 'a', by.y = 'a'),
      data.table(a = 2:3, V.x = c("a", "a"), V.y = c("b", "b"), key = 'a'),
      warning = 'Supplied both.*argument will be ignored')
-test(1962.018, merge(DT1, DT2, by.x = 'z', by.y = 'a'),
-     error = 'Elements listed in `by.x`')
-test(1962.019, merge(DT1, DT2, by.x = 'a', by.y = 'z'),
-     error = 'Elements listed in `by.y`')
+test(1962.018, merge(DT1, DT2, by.x='z', by.y='a'),
+     error="The following columns listed in `by.x` are missing from x: [z]")
+test(1962.019, merge(DT1, DT2, by.x='a', by.y='z'),
+     error="The following columns listed in `by.y` are missing from y: [z]")
 test(1962.0201, merge(DT1, DT2, by=character(0L)), ans)  # was error before PR#5183
 test(1962.0202, merge(DT1, DT2, by=NULL),          ans)  # test explicit NULL too as missing() could be used inside merge()
-test(1962.021, merge(DT1, DT2, by = 'z'),
-     error = 'must be valid column names in x and y')
+test(1962.021, merge(DT1, DT2, by='z'),
+     error='The following columns listed in `by` are missing from x: [z]')
 
 ## frank.R
 x = c(1, 1, 2, 5, 4, 3, 4, NA, 6)
@@ -18014,7 +18013,7 @@ test(2230.4, setDF(merge(DT, y, by="k2", incomparables=c(1, NA, 4, 5))), merge(x
 test(2230.5, setDF(merge(DT, y, by="k2", incomparables=c(NA, 3, 4, 5))), merge(x, y, by="k2", incomparables=c(NA,3,4,5)))
 test(2230.6, merge(DT, y, by="k2", unk=1), merge(DT, y, by="k2"), warning="Unknown argument 'unk' has been passed.")
 test(2230.7, merge(DT, y, by="k2", NULL, NULL, FALSE, FALSE, FALSE, TRUE, c(".x", ".y"), TRUE, getOption("datatable.allow.cartesian"), NULL, 1L),
-             merge(DT, y, by="k2"), warning=c("Supplied both `by` and `by.x/by.y`. `by` argument will be ignored.", "Passed 1 unknown and unnamed arguments."))
+             merge(DT, y, by="k2"), warning=c("Supplied both `by` and `by.x`/`by.y`. `by` argument will be ignored.", "Passed 1 unknown and unnamed arguments."))
 
 # weighted.mean GForce optimized, #3977
 old = options(datatable.optimize=1L)