Rdatatable · mattdowle · Mar 6, 2019 · Mar 5, 2019 · Mar 5, 2019 · Mar 5, 2019
@@ -32,6 +32,8 @@
 
 10. Quoted expression having `:=` and dot alias in RHS now works as expected. Thanks to @franknarf1 for raising up issue on [StackOverflow](https://stackoverflow.com/questions/41228076/using-data-tables-shortcut-in-quoted-expressions) and @jangorecki for PR.
 
+11. A join's result could be incorrectly keyed when a single nomatch occurred at the very beginning while all other values matched, [#3441](https://github.com/Rdatatable/data.table/issues/3441). The incorrect key would cause incorrect results in subsequent queries. Thanks to @symbalex for reporting and @franknarf1 for pinpointing the root cause.
+
 #### NOTES
 
 1. When upgrading to 1.12.0 some Windows users might have seen `CdllVersion not found` in some circumstances. We found a way to catch that so the [helpful message](https://twitter.com/MattDowle/status/1084528873549705217) now occurs for those upgrading from versions prior to 1.12.0 too, as well as those upgrading from 1.12.0 to a later version. See item 1 in notes section of 1.12.0 below for more background.

@@ -13454,7 +13454,7 @@ test(1986.4, DT[, as.list(colMeans(.SD)), by=gear], cbind(DT[,"gear"],DT[,-"gear
 # tests for #2949, #1974 and #1369 - dcast not able to handle functions referred to by a variable
 dt = data.table(
   x=sample(5,20,TRUE),
-  y=sample(2,20,TRUE), 
+  y=sample(2,20,TRUE),
   z=sample(letters[1:2], 20,TRUE),
   d1 = runif(20),
   d2=1L
@@ -13466,7 +13466,7 @@ myFun1 <- function(data, vars) {
 myFun2 <- function(data, vars) {
   myFuns <- list(f1=sum, first=function(x) x[1L])
   dcast.data.table(data, "x + y ~ z", value.var=vars, fun.aggregate=myFuns)
-} 
+}
 funs = list(sum, mean)
 vars = list("d1", "d2")
 
@@ -13508,7 +13508,7 @@ DT2 <- data.table(
   flight=sample(codes[1200:2000], 1e4, TRUE)
 )
 DT2[, c("start", "end") := .(pmin(start, end), pmax(start, end))]
-# just testing if the code runs without segfault .. 
+# just testing if the code runs without segfault ..
 test(1989.1, nrow(DT1[DT2, on=.(date <= end, date >= start, flight==flight)]) > 0L, TRUE)
 
 # fix for #2202, dcast needs to rank NA with na.last=FALSE in frankv within dcast
@@ -13537,9 +13537,9 @@ DT_y <- data.table(y1 = c(1,10), y2 = c(9,50), yval = c("y_a","y_b"), key = c("y
 test(1992.1, foverlaps(DT_x, DT_y), error="All rows with NA values")
 
 # foverlaps POSIXct checks #1143 + another check...
-xp <- data.frame(year = c(2006L, 2006L, 2006L), day = c(361L, 361L, 360L), 
+xp <- data.frame(year = c(2006L, 2006L, 2006L), day = c(361L, 361L, 360L),
                  hour = c(14L, 8L, 8L), min = c(30L, 0L, 30L), val = c(0.5, 0.3, 0.4),
-                 Date = structure(c(1167229800, 1167206400, 1167121800), 
+                 Date = structure(c(1167229800, 1167206400, 1167121800),
                           class = c("POSIXct", "POSIXt"), tzone = "UTC")) ## "UTC" time zone
 setDT(xp)[, `:=`(start = Date - 1800L, end = Date + 1800L)]
 tt <- as.POSIXct(c("2006/12/27 14:23:59", "2006/12/27 16:47:59", "2006/12/27 19:12:00"), format = "%Y/%m/%d %T", tz = "Asia/Jerusalem") ## different time zone
@@ -13618,6 +13618,14 @@ x = 5L
 test(1998.1, as.IDate(x), output = '1970-01-06')
 test(1998.2, class(x), 'integer')
 
+# a single NA at the beginning with no other nomatch would cause incorrect key, #3441
+dx = data.table(id = "A", key = "id")
+di = list(c("D", "A"))
+test(1999.1, key(dx[di]), NULL)
+dx = data.table(id = 1L, key = "id")
+di = list(z=c(2L, 1L))
+test(1999.2, key(dx[di]), NULL)
+
 
 ###################################
 #  Add new tests above this line  #

@@ -1244,23 +1244,20 @@ SEXP fsorted(SEXP x)
   return ScalarLogical(i==n);
 }
 
-SEXP isOrderedSubset(SEXP x, SEXP nrow)
+SEXP isOrderedSubset(SEXP x, SEXP nrowArg)
 // specialized for use in [.data.table only
 // Ignores 0s but heeds NAs and any out-of-range (which result in NA)
 {
-  int i=0, last, elem;
-  if (!length(x)) return(ScalarLogical(TRUE));
-  if (!isInteger(x)) error("x has non-0 length but isn't an integer vector");
-  if (!isInteger(nrow) || LENGTH(nrow)!=1 || INTEGER(nrow)[0]<0) error("nrow must be integer vector length 1 and >=0");
-  if (LENGTH(x)<=1) return(ScalarLogical(TRUE));
-  while (i<LENGTH(x) && INTEGER(x)[i]==0) i++;
-  if (i==LENGTH(x)) return(ScalarLogical(TRUE));
-  last = INTEGER(x)[i];  // the first non-0
-  i++;
-  for (; i<LENGTH(x); i++) {
-    elem = INTEGER(x)[i];
-    if (elem == 0) continue;
-    if (elem < last || elem < 0 || elem > INTEGER(nrow)[0])
+  if (!isNull(x) && !isInteger(x)) error("x must be either NULL or an integer vector");
+  if (length(x)<=1) return(ScalarLogical(TRUE));  // a single NA when length(x)==1 is ordered (e.g. tests 128 & 130) otherwise anyNA => FALSE
+  if (!isInteger(nrowArg) || LENGTH(nrowArg)!=1) error("nrow must be integer vector length 1");
+  const int nrow = INTEGER(nrowArg)[0];
+  if (nrow<0) error("nrow==%d but must be >=0", nrow);
+  const int *xd = INTEGER(x), xlen=LENGTH(x);
+  for (int i=0, last=INT_MIN; i<xlen; ++i) {
+    int elem = xd[i];
+    if (elem==0) continue;
+    if (elem<last || elem<0/*includes NA==INT_MIN*/ || elem>nrow)
       return(ScalarLogical(FALSE));
     last = elem;
   }