Skip to content

Commit

Permalink
address multiple matches during update-on-join #3747
Browse files Browse the repository at this point in the history
  • Loading branch information
jangorecki committed Apr 19, 2020
1 parent cf73fcf commit b64c0c3
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 12 deletions.
31 changes: 26 additions & 5 deletions R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ replace_dot_alias = function(e) {
}
}

"[.data.table" = function (x, i, j, by, keyby, with=TRUE, nomatch=getOption("datatable.nomatch", NA), mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL)
"[.data.table" = function (x, i, j, by, keyby, with=TRUE, nomatch=getOption("datatable.nomatch", NA), mult, roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL)
{
# ..selfcount <<- ..selfcount+1 # in dev, we check no self calls, each of which doubles overhead, or could
# test explicitly if the caller is [.data.table (even stronger test. TO DO.)
Expand Down Expand Up @@ -183,7 +183,12 @@ replace_dot_alias = function(e) {
}
return(x)
}
if (!mult %chin% c("first","last","all","error")) stop("mult argument can only be 'first', 'last', 'all' or 'error'")
missingmult = missing(mult)
if (!missingmult) {
if (!mult %chin% c("first","last","all","error")) stop("mult argument can only be 'first', 'last', 'all' or 'error'")
} else {
mult = "all"
}
missingroll = missing(roll)
if (length(roll)!=1L || is.na(roll)) stop("roll must be a single TRUE, FALSE, positive/negative integer/double including +Inf and -Inf or 'nearest'")
if (is.character(roll)) {
Expand Down Expand Up @@ -212,6 +217,7 @@ replace_dot_alias = function(e) {
..syms = NULL
av = NULL
jsub = NULL
jassign = FALSE
if (!missing(j)) {
jsub = replace_dot_alias(substitute(j))
root = if (is.call(jsub)) as.character(jsub[[1L]])[1L] else ""
Expand Down Expand Up @@ -284,6 +290,7 @@ replace_dot_alias = function(e) {
warning("nomatch isn't relevant together with :=, ignoring nomatch")
nomatch=0L
}
jassign = TRUE
}
}

Expand Down Expand Up @@ -442,7 +449,7 @@ replace_dot_alias = function(e) {
ops = rep(1L, length(leftcols))
}
# Implementation for not-join along with by=.EACHI, #604
if (notjoin && (byjoin || (mult=="first" || mult=="last"))) { # mult != "all" needed for #1571
if (notjoin && (byjoin || (mult=="first" || mult=="last"))) { # mult != "all|error" needed for #1571
notjoin = FALSE
if (verbose) {last.started.at=proc.time();cat("not-join called with 'by=.EACHI'; Replacing !i with i=setdiff_(x,i) ...");flush.console()}
orignames = copy(names(i))
Expand All @@ -452,8 +459,22 @@ replace_dot_alias = function(e) {
setattr(i, 'sorted', names(i)) # since 'x' has key set, this'll always be sorted
}
i = .shallow(i, retain.key = TRUE)
if (!missingmult && jassign && missingby) { ## if we just could swap x and i for jassign, all 'mult' cases, dups check via ans$allLens1 would already work
if (mult=="first") stop("Argument mult='first' during update-on-join not yet implemented")
else if (mult=="all") {
warning("Argument 'mult' during update-on-join must not be equal to 'all'. For backward compatibility it will be set to 'last'. To avoid this warning do not use mult arg or provide value other than 'all'.")
mult = "last"
} # mult=="error" # handled after bmerge, not instantly
}
ans = bmerge(i, x, leftcols, rightcols, roll, rollends, nomatch, mult, ops, verbose=verbose)
if (mult=="error") mult="all" ## there was no multiple matches so we can proceed as if 'all'
dups = NULL # so we can re-use later
if (jassign && missingby && mult!="last" && (dups<-anyDuplicated(ans$starts, incomparables = c(0L, NA_integer_)))) { # warn here if duplicated matches occured, as proposed in #3747, ask users to use mult='last' explicitly
if (missingmult)
warning("During update-on-join there were multiple matches, in such case the last matching row will be used to lookup the value from. To avoid this warning use mult argument 'last' (default), 'first' (not yet implemented) or 'error'.")
else if (mult=="error")
stop("mult='error' and multiple matches during merge") # same error as in bmerge
}
if (mult=="error") mult="all" ## there were no multiple matches (error would have been raised already) so we can proceed as if 'all', or probably even as if any first or last as well
xo = ans$xo ## to make it available for further use.
# temp fix for issue spotted by Jan, test #1653.1. TODO: avoid this
# 'setorder', as there's another 'setorder' in generating 'irows' below...
Expand All @@ -480,7 +501,7 @@ replace_dot_alias = function(e) {
irows = if (allLen1) f__ else vecseq(f__,len__,
if (allow.cartesian ||
notjoin || # #698. When notjoin=TRUE, ignore allow.cartesian. Rows in answer will never be > nrow(x).
!anyDuplicated(f__, incomparables = c(0L, NA_integer_))) {
!(if (!is.null(dups)) dups else anyDuplicated(f__, incomparables = c(0L, NA_integer_)))) {
NULL # #742. If 'i' has no duplicates, ignore
} else as.double(nrow(x)+nrow(i))) # rows in i might not match to x so old max(nrow(x),nrow(i)) wasn't enough. But this limit now only applies when there are duplicates present so the reason now for nrow(x)+nrow(i) is just to nail it down and be bigger than max(nrow(x),nrow(i)).
if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()}
Expand Down
2 changes: 1 addition & 1 deletion inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -5814,7 +5814,7 @@ DT <- data.table(id=rep(letters[1:2], 2), var = rnorm(4), key="id")
test(1383.1, DT[letters[1:3], list(var)], DT[1:5, list(var)])
# Fix for #800 - allow.cartesian should be ignored if jsub[1L] has `:=`.
DT=data.table(id=c(1,1), date=c(1992,1991), value=c(4.1,4.5), key="id")
test(1383.2, copy(DT)[DT, a:=1], DT[, a := 1])
test(1383.2, copy(DT)[DT, a:=1], DT[, a := 1], warning="last matching row will be used") #4370

# Somehow DT[col==max(col)] was never tested, broken by auto-indexing new in v1.9.4, #858
DT = data.table(a = c(1,1,1,2,2,2,3,3,3), b = rnorm(9))
Expand Down
12 changes: 6 additions & 6 deletions man/data.table.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac
\method{[}{data.table}(x, i, j, by, keyby, with = TRUE,
nomatch = getOption("datatable.nomatch", NA),
mult = "all",
mult,
roll = FALSE,
rollends = if (roll=="nearest") c(TRUE,TRUE)
else if (roll>=0) c(FALSE,TRUE)
Expand Down Expand Up @@ -57,7 +57,7 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac
If \code{i} is a \code{data.table}, the columns in \code{i} to be matched against \code{x} can be specified using one of these ways:
\itemize{
\item{\code{on} argument (see below). It allows for both \code{equi-} and the newly implemented \code{non-equi} joins.}
\item{\code{on} argument (see below). It allows for both \emph{equi} and \emph{non-equi} joins.}
\item{If not, \code{x} \emph{must be keyed}. Key can be set using \code{\link{setkey}}. If \code{i} is also keyed, then the first \emph{key} column of \code{i} is matched against the first \emph{key} column of \code{x}, the second against the second, etc.
Expand All @@ -67,9 +67,9 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac
}
Using \code{on=} is recommended (even during keyed joins) as it helps understand the code better and also allows for \emph{non-equi} joins.
When the binary operator \code{==} alone is used, an \emph{equi} join is performed. In SQL terms, \code{x[i]} then performs a \emph{right join} by default. \code{i} prefixed with \code{!} signals a \emph{not-join} or \emph{not-select}.
When the binary operator \code{==} alone is used, an \emph{equi} join is performed. In SQL terms, \code{x[i]} then performs a \emph{right outer join} by default. \code{i} prefixed with \code{!} signals a \emph{not-join} or \emph{not-select}.
Support for \emph{non-equi} join was recently implemented, which allows for other binary operators \code{>=, >, <= and <}.
\emph{Non-equi} joins are also supported, which allows the other binary operators \code{>=}, \code{>}, \code{<=} and \code{<} to be used.
See \href{../doc/datatable-keys-fast-subset.html}{\code{vignette("datatable-keys-fast-subset")}} and \href{../doc/datatable-secondary-indices-and-auto-indexing.html}{\code{vignette("datatable-secondary-indices-and-auto-indexing")}}.
Expand Down Expand Up @@ -118,7 +118,7 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac
\item{nomatch}{ When a row in \code{i} has no match to \code{x}, \code{nomatch=NA} (default) means \code{NA} is returned. \code{NULL} (or \code{0} for backward compatibility) means no rows will be returned for that row of \code{i}. Use \code{options(datatable.nomatch=NULL)} to change the default value (used when \code{nomatch} is not supplied).}
\item{mult}{ When \code{i} is a \code{list} (or \code{data.frame} or \code{data.table}) and \emph{multiple} rows in \code{x} match to the row in \code{i}, \code{mult} controls which are returned: \code{"all"} (default), \code{"first"} or \code{"last"}.}
\item{mult}{ When \code{i} is a \code{list} (or \code{data.frame} or \code{data.table}) and \emph{multiple} rows in \code{x} match a row in \code{i}, \code{mult} controls which are returned: \code{"all"}, \code{"first"}, \code{"last"} or \code{"error"}; \code{"error"} raises an error if multiple matches are found. The default is \code{"all"}, except during \emph{update-on-join} (a join combined with \code{:=}), where the default is \code{"last"} and \code{"first"} is not yet implemented.}
\item{roll}{ When \code{i} is a \code{data.table} and its row matches to all but the last \code{x} join column, and its value in the last \code{i} join column falls in a gap (including after the last observation in \code{x} for that group), then:
Expand Down Expand Up @@ -216,7 +216,7 @@ A \code{data.table} is a \code{list} of vectors, just like a \code{data.frame}.
\item it has enhanced functionality in \code{[.data.table} for fast joins of keyed tables, fast aggregation, fast last observation carried forward (LOCF) and fast add/modify/delete of columns by reference with no copy at all.
}
See the \code{see also} section for the several other \emph{methods} that are available for operating on data.tables efficiently.
See the \emph{see also} section for the several other \emph{methods} that are available for operating on data.tables efficiently.
}
\references{
Expand Down

0 comments on commit b64c0c3

Please sign in to comment.