Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Join without merge #240

Merged
merged 29 commits into from
Jul 1, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
3633dc4
hacky version of using `x[y]` join for all but `full_join()`
mgirlich May 4, 2021
9ec6a8c
refactor
mgirlich May 4, 2021
657156a
refactor
mgirlich May 4, 2021
03e730f
more refactoring
mgirlich May 4, 2021
46ecff8
use much simpler approach
mgirlich May 5, 2021
ff7d8b3
fix handling of duplicate column names
mgirlich May 5, 2021
17a5031
fix check() issues
mgirlich May 6, 2021
dd16fa2
copy the correct table in `left_join()`
mgirlich May 6, 2021
494a0ca
fix variable order in full_join() again
mgirlich May 7, 2021
4cb3d81
update documentation
mgirlich May 7, 2021
906a4bc
update news
mgirlich May 7, 2021
f99f397
improve `step_colorder()`
mgirlich May 7, 2021
395eb1e
refactor
mgirlich May 7, 2021
dde97a3
Merge commit '0c3f7eff202667aed11ab2ff0ffb71adcf0b5df6'
mgirlich May 11, 2021
76dc24b
fix `step_setnames()`: also subset `locs`
mgirlich May 11, 2021
cc69ecd
use `step_setnames()` and adapt tests
mgirlich May 11, 2021
b8e4391
Update R/step-join.R
mgirlich May 11, 2021
a817cfd
Update R/step-join.R
mgirlich May 11, 2021
1ede9d1
Update R/step-join.R
mgirlich May 11, 2021
76257cc
reorder cases for clarity
mgirlich May 11, 2021
9bf9bf4
refactor and actually respect suffixes
mgirlich May 11, 2021
ec50bc7
Merged origin/join-without-merge into join-without-merge
mgirlich May 11, 2021
81ea65b
`step_colorder()` checks for duplicate elements in `col_order`
mgirlich May 11, 2021
1c561a5
add new snapshot
mgirlich May 11, 2021
ef642db
Check if vars uniquely identified in `step_colorder()`
mgirlich May 12, 2021
b58a4ca
`step_colorder()` avoids work more often
mgirlich May 25, 2021
ef7c237
describe advantage of using `[` joins
mgirlich Jun 30, 2021
865b702
Change description of advantage
mgirlich Jul 1, 2021
13bd134
Merged upstream/master into join-without-merge
mgirlich Jul 1, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# dtplyr (development version)

* `left_join()`, `right_join()`, and `inner_join()` are now always translated to
hadley marked this conversation as resolved.
Show resolved Hide resolved
the `[.data.table` equivalent. For simple merges the translation gets a bit
longer but thanks to the simpler code base it helps to better handle
names in `by` and duplicated variables names produced in the data.table join
(@mgirlich, #222).

* `transmute()` doesn't produce duplicate columns when assigning to the same
variable (@mgirlich, #249).

Expand Down
37 changes: 31 additions & 6 deletions R/step-colorder.R
Original file line number Diff line number Diff line change
@@ -1,11 +1,36 @@
step_colorder <- function(parent, col_order) {
stopifnot(is_step(parent))
stopifnot(is.character(col_order))
step_colorder <- function(x, col_order) {
stopifnot(is_step(x))
stopifnot(is.character(col_order) || is.integer(col_order))

step_call(parent,
if (any(duplicated(col_order))) {
abort("Every element of `col_order` must be unique.")
}

if (is.integer(col_order)) {
if (identical(col_order, seq_along(col_order))) {
return(x)
}
vars <- x$vars[col_order]
} else {
vars_selected <- x$vars[x$vars %in% col_order]
vars_count <- vctrs::vec_count(vars_selected)
vars_problematic <- vars_count$key[vars_count$count != 1]
if (!is_empty(vars_problematic)) {
vars_error <- paste0(vars_problematic, collapse = ", ")
msg <- paste0("The column(s) ", vars_error, " do not uniquely match a column in `x`.")
abort(msg)
}

if (identical(col_order, x$vars[seq_along(col_order)])) {
return(x)
}
vars <- col_order
}

step_call(x,
"setcolorder",
args = list(col_order),
vars = col_order,
in_place = !parent$implicit_copy
vars = vars,
in_place = !x$implicit_copy
)
}
229 changes: 145 additions & 84 deletions R/step-join.R
Original file line number Diff line number Diff line change
@@ -1,26 +1,42 @@
step_join <- function(x, y, on, style, suffix = c(".x", ".y")) {
stopifnot(is_step(x))
stopifnot(is_step(y))
stopifnot(is.character(on))
stopifnot(is.null(on) || is.character(on))
style <- match.arg(style, c("inner", "full", "right", "left", "semi", "anti"))

style <- match.arg(style, c("inner", "full", "left", "semi", "anti"))
if (style %in% c("semi", "anti")) {
vars <- x$vars
} else {
vars <- join_vars(x$vars, y$vars, on, suffix)
}
on <- dplyr::common_by(on, x, y)

new_step(
parent = x,
vars_out_dt <- dt_join_vars(x$vars, y$vars, on$x, on$y, suffix = suffix, style = style)
colorder <- dt_join_colorder(x$vars, y$vars, on$x, on$y, style)

# TODO suppress warning in merge
# "column names ... are duplicated in the result
out <- new_step(
parent = if (style == "left") y else x,
implicit_copy = TRUE,
parent2 = y,
vars = vars,
on = on,
suffix = suffix,
parent2 = if (style == "left") x else y,
vars = vars_out_dt,
on = if (style %in% c("left", "full")) on else list(x = on$y, y = on$x),
style = style,
locals = utils::modifyList(x$locals, y$locals),
class = "dtplyr_step_join"
)

if (style %in% c("anti", "semi")) {
return(out)
}

out <- step_colorder(out, colorder)
hadley marked this conversation as resolved.
Show resolved Hide resolved

x_sim <- simulate_vars(x)
y_sim <- simulate_vars(y)
vars <- dplyr_join_vars(x_sim, y_sim, on$x, on$y, suffix = suffix)

if (any(duplicated(vars_out_dt))) {
step_setnames(out, colorder, vars, in_place = FALSE)
} else {
step_setnames(out, vars_out_dt[colorder], vars, in_place = FALSE)
}
}

#' @export
Expand All @@ -33,38 +49,32 @@ dt_sources.dtplyr_step_join <- function(x) {
dt_call.dtplyr_step_join <- function(x, needs_copy = x$needs_copy) {
lhs <- dt_call(x$parent, needs_copy)
rhs <- dt_call(x$parent2)
on <- call2(".", !!!syms(x$on))

by.x <- as.character(x$on)
by.y <- ifelse(names(x$on) == "", by.x, names(x$on))

call <- switch(x$style,
inner = call2("merge", lhs, rhs, all = FALSE, by.x = by.x, by.y = by.y, allow.cartesian = TRUE),
full = call2("merge", lhs, rhs, all = TRUE, by.x = by.x, by.y = by.y, allow.cartesian = TRUE),
left = call2("merge", lhs, rhs, all.x = TRUE, all.y = FALSE, by.x = by.x, by.y = by.y, allow.cartesian = TRUE),
semi = call2("[", lhs, call2("unique", call2("[", lhs, rhs, which = TRUE, nomatch = NULL, on = on))),
anti = call2("[", lhs, call2("!", rhs), on = on),
abort("Invalid style")
)
on2 <- simplify_names(stats::setNames(x$on$x, x$on$y))

# Hack on suffix if not the default
if (is_call(call, "merge") && !identical(x$suffix, c(".x", ".y"))) {
call$suffixes <- x$suffix
}
on <- call2(".", !!!syms(on2))

call
switch(x$style,
full = call2("merge", lhs, rhs, all = TRUE, by.x = x$on$x, by.y = x$on$y, allow.cartesian = TRUE),
left = call2("[", lhs, rhs, on = on, allow.cartesian = TRUE),
inner = call2("[", lhs, rhs, on = on, nomatch = NULL),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mgirlich does this deliberately not include allow.cartesian = TRUE? Or was it omitted by accident?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm pretty sure this was by accident.

right = call2("[", lhs, rhs, on = on, allow.cartesian = TRUE),
anti = call2("[", lhs, call2("!", rhs), on = on),
semi = call2("[", lhs, call2("unique", call2("[", lhs, rhs, which = TRUE, nomatch = NULL, on = on)))
)
}

# dplyr verbs -------------------------------------------------------------

#' Join data tables
#'
#' These are methods for the dplyr generics [left_join()], [right_join()],
#' [inner_join()], [full_join()], [anti_join()], and [semi_join()]. The
#' mutating joins (left, right, inner, and full) are translated to
#' [data.table::merge.data.table()], except for the special cases where it's
#' possible to translate to `[.data.table`. Semi- and anti-joins have no
#' direct data.table equivalent.
#' [inner_join()], [full_join()], [anti_join()], and [semi_join()]. Left, right,
#' inner, and anti join are translated to the `[.data.table` equivalent,
#' full joins to [data.table::merge.data.table()].
#' Left, right, and full joins are in some cases followed by calls to
#' [data.table::setcolorder()] and [data.table::setnames()] to ensure that column
#' order and names match dplyr conventions.
#' Semi-joins don't have a direct data.table equivalent.
#'
#' @param x,y A pair of [lazy_dt()]s.
#' @inheritParams dplyr::left_join
Expand All @@ -85,16 +95,8 @@ dt_call.dtplyr_step_join <- function(x, needs_copy = x$needs_copy) {
#' band_dt %>% anti_join(instrument_dt)
left_join.dtplyr_step <- function(x, y, ..., by = NULL, copy = FALSE, suffix = c(".x", ".y")) {
y <- dtplyr_auto_copy(x, y, copy = copy)
by <- dtplyr_common_by(by, x, y)

if (join_is_simple(x$vars, y$vars, by)) {
col_order <- unique(c(x$vars, y$vars))
out <- step_subset_on(y, x, i = y, on = by)

step_colorder(out, col_order)
} else {
step_join(x, y, on = by, style = "left", suffix = suffix)
}
step_join(x, y, by, style = "left", suffix = suffix)
}

#' @export
Expand All @@ -107,13 +109,8 @@ left_join.data.table <- function(x, y, ...) {
#' @export
right_join.dtplyr_step <- function(x, y, ..., by = NULL, copy = FALSE, suffix = c(".x", ".y")) {
y <- dtplyr_auto_copy(x, y, copy = copy)
by <- dtplyr_common_by(by, y, x)

if (join_is_simple(x$vars, y$vars, by)) {
step_subset_on(x, y, i = y, on = by)
} else {
step_join(y, x, on = by, style = "left", suffix = suffix)
}
step_join(x, y, by, style = "right", suffix = suffix)
}

#' @export
Expand All @@ -123,21 +120,10 @@ right_join.data.table <- function(x, y, ...) {
}


step_subset_on <- function(x, y, i, on) {
step_subset(x,
vars = union(x$vars, y$vars),
i = y,
on = on,
locals = utils::modifyList(x$locals, y$locals),
allow_cartesian = TRUE
)
}

#' @importFrom dplyr inner_join
#' @export
inner_join.dtplyr_step <- function(x, y, ..., by = NULL, copy = FALSE, suffix = c(".x", ".y")) {
y <- dtplyr_auto_copy(x, y, copy = copy)
by <- dtplyr_common_by(by, x, y)

step_join(x, y, on = by, style = "inner", suffix = suffix)
}
Expand All @@ -152,7 +138,6 @@ inner_join.data.table <- function(x, y, ...) {
#' @export
full_join.dtplyr_step <- function(x, y, ..., by = NULL, copy = FALSE, suffix = c(".x", ".y")) {
y <- dtplyr_auto_copy(x, y, copy = copy)
by <- dtplyr_common_by(by, x, y)

step_join(x, y, on = by, style = "full", suffix = suffix)
}
Expand All @@ -167,7 +152,6 @@ full_join.data.table <- function(x, y, ...) {
#' @export
anti_join.dtplyr_step <- function(x, y, ..., by = NULL, copy = FALSE) {
y <- dtplyr_auto_copy(x, y, copy = copy)
by <- dtplyr_common_by(by, x, y)

step_join(x, y, on = by, style = "anti")
}
Expand All @@ -181,9 +165,6 @@ anti_join.data.table <- function(x, y, ...) {
#' @importFrom dplyr semi_join
#' @export
semi_join.dtplyr_step <- function(x, y, ..., by = NULL, copy = FALSE) {
y <- dtplyr_auto_copy(x, y, copy = copy)
by <- dtplyr_common_by(by, x, y)

step_join(x, y, on = by, style = "semi")
}

Expand All @@ -195,11 +176,6 @@ semi_join.data.table <- function(x, y, ...) {

# helpers -----------------------------------------------------------------

dtplyr_common_by <- function(by, x, y) {
by <- dplyr::common_by(by, x, y)
simplify_names(stats::setNames(by$x, by$y))
}

dtplyr_auto_copy <- function(x, y, copy = copy) {
if (is_step(y)) {
y
Expand All @@ -210,25 +186,110 @@ dtplyr_auto_copy <- function(x, y, copy = copy) {
}
}

join_is_simple <- function(x, y, by) {
if (is_named(by)) {
return(FALSE)
add_suffixes <- function (x, y, suffix) {
x[x %in% y] <- paste0(x[x %in% y], suffix)
x
}

dplyr_join_vars <- function(x, y, on_x, on_y, suffix) {
colnames(left_join(x, y, by = stats::setNames(on_y, on_x), suffix = suffix))
}

dt_join_vars <- function(x, y, on_x, on_y, suffix, style) {
style <- match.arg(style, c("inner", "full", "right", "left", "semi", "anti"))

if (style == "left") {
# need to swap `x` and `y` as the data.table left join is `y[x, on]`
subset_join_vars(y, x, on_y = on_x)
} else if (style %in% c("right", "inner")) {
subset_join_vars(x, y, on_y)
} else if (style == "full") {
merge_vars(x, y, on_x, on_y, suffix)
} else {
x
}
}

common_vars <- setdiff(intersect(x, y), by)
length(common_vars) == 0
# column names as generated in `x[y, on = on]`
subset_join_vars <- function(x, y, on_y) {
# `y` variables used for joining are not included again
y_out <- setdiff(y, on_y)
# remaining `y` columns that are also in `x` get _prefixed_ by "i."
y_out[y_out %in% x] <- paste0("i.", y_out[y_out %in% x])
out_names <- c(x, y_out)

add_dt_suffix(out_names)
}

add_dt_suffix <- function(x) {
for (i in seq_along(x)) {
j <- 1
nm <- x[[i]]
first_occurrence <- !nm %in% x[seq(0, i - 1)]
if (!first_occurrence) {
while (nm %in% x[-i]) {
nm <- paste0(x[[i]], ".", j)
j <- j + 1
}
}
x[[i]] <- nm
}
x
}

join_vars <- function(x, y, on, suffixes) {
y <- setdiff(y, if (is_named(on)) names(on) else on)
vars <- union(x, y)
# column names as generated by `merge(x, y, by.x = on_x, by.y = on_y)`
merge_vars <- function(x, y, on_x, on_y, suffix = c(".x", ".y")) {
x <- setdiff(x, on_x)
y <- setdiff(y, on_y)

x_out <- add_suffixes(x, y, suffix[[1]])
y_out <- add_suffixes(y, x, suffix[[2]])

both <- intersect(x, y)
if (length(both) > 0) {
vars <- c(setdiff(vars, both), paste0(both, suffixes[[1]]), paste0(both, suffixes[[2]]))
c(on_x, x_out, y_out)
}

dt_join_colorder <- function(x, y, on_x, on_y, style) {
style <- match.arg(style, c("inner", "full", "right", "left", "semi", "anti"))

if (style == "left") {
subset_left_join_colorder(x, y, on_x, on_y)
} else if (style == "full") {
merge_join_colorder(x, y, on_x, on_y)
} else {
seq(length(x) + length(y) - length(on_x))
}
}

#' column order of data.table left join `y[x]` compared to `left_join(y, x)`
#' @noRd
subset_left_join_colorder <- function(x, y, on_x, on_y) {
# variable order
# y[x, on]: y-vars, x-vars - on_x
# left_join(x, y, on): x-vars, y-vars - on_y

x_out_dt <- setdiff(x, on_x)
x_loc <- vctrs::vec_match(x, x_out_dt) + length(y)
x_loc[is.na(x_loc)] <- vctrs::vec_match(on_y, y)

y_out_dt <- setdiff(y, on_y)
y_loc <- vctrs::vec_match(y_out_dt, y)

c(x_loc, y_loc)
}

merge_join_colorder <- function(x, y, on_x, on_y) {
# variable order
# merge(x, y, on_x, on_y): on_x, x-vars - on_x, y-vars - on_y
# full_join(x, y, on): x-vars, y-vars - on_y

x_out_dt <- setdiff(x, on_x)
x_loc <- vctrs::vec_match(x, x_out_dt) + length(on_x)
x_loc[is.na(x_loc)] <- vctrs::vec_match(x[is.na(x_loc)], on_x)

n_x <- length(x)
n_y_out <- length(y) - length(on_x)

vars
c(x_loc, n_x + seq2(1, n_y_out))
}

#' @importFrom dplyr same_src
Expand Down
1 change: 1 addition & 0 deletions R/step-setnames.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ step_setnames <- function(x, old, new, in_place, rename_groups = FALSE) {
name_changed <- x$vars[locs] != new
old <- old[name_changed]
new <- new[name_changed]
locs <- locs[name_changed]

if (length(old) == 0) {
return(x)
Expand Down
Loading