Skip to content

Commit

Permalink
Implement expand and complete (#225)
Browse files Browse the repository at this point in the history
Also bump minimum data.table version to 1.13.0
  • Loading branch information
markfairbanks authored Mar 19, 2021
1 parent 9c83443 commit 3360f20
Show file tree
Hide file tree
Showing 13 changed files with 383 additions and 10 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ Depends:
R (>= 3.3)
Imports:
crayon,
data.table (>= 1.12.4),
data.table (>= 1.13.0),
dplyr (>= 1.0.3),
ellipsis,
glue,
Expand Down
8 changes: 6 additions & 2 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,14 @@

* `.data` and `.env` pronouns now work inside of `if_else()` (@markfairbanks, #220).

* More tidyr verbs are in the process of being added:

* More translations for tidyr verbs have been added:
* `drop_na()` (@markfairbanks, #194)

* `complete()` (@markfairbanks, #225)

* `expand()` (@markfairbanks, #225)

* `fill()` (@markfairbanks, #197)

* `pivot_longer()` (@markfairbanks, #204)
Expand Down
38 changes: 38 additions & 0 deletions R/complete.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#' Complete a data frame with missing combinations of data
#'
#' @description
#' This is a method for the tidyr `complete()` generic. This is a wrapper
#' around `dtplyr` translations for `expand()`, `full_join()`, and `replace_na()`
#' that's useful for completing missing combinations of data.
#'
#' @param data A [lazy_dt()].
#' @inheritParams tidyr::complete
#' @examples
#' library(tidyr)
#' tbl <- tibble(x = 1:2, y = 1:2, z = 3:4)
#' dt <- lazy_dt(tbl)
#'
#' dt %>%
#' complete(x, y)
#'
#' dt %>%
#' complete(x, y, fill = list(z = 10L))
# exported onLoad
complete.dtplyr_step <- function(data, ..., fill = list()) {
dots <- enquos(...)
dots <- dots[!vapply(dots, quo_is_null, logical(1))]
if (length(dots) == 0) {
return(data)
}

full <- tidyr::expand(data, !!!dots)
full <- dplyr::full_join(full, data, by = full$vars)
full <- tidyr::replace_na(full, replace = fill)
full
}

# exported onLoad
complete.data.table <- function(data, ..., fill = list()) {
data <- lazy_dt(data)
tidyr::complete(data, ..., fill = fill)
}
3 changes: 2 additions & 1 deletion R/step-join.R
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,8 @@ step_subset_on <- function(x, y, i, on) {
vars = union(x$vars, y$vars),
i = y,
on = on,
locals = utils::modifyList(x$locals, y$locals)
locals = utils::modifyList(x$locals, y$locals),
allow_cartesian = TRUE
)
}

Expand Down
78 changes: 78 additions & 0 deletions R/step-subset-expand.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#' Expand data frame to include all possible combinations of values.
#'
#' @description
#' This is a method for the tidyr `expand()` generic. It is translated to
#' [data.table::CJ()].
#'
#' @param data A [lazy_dt()].
#' @inheritParams tidyr::expand
#' @examples
#' library(tidyr)
#'
#' fruits <- lazy_dt(tibble(
#' type = c("apple", "orange", "apple", "orange", "orange", "orange"),
#' year = c(2010, 2010, 2012, 2010, 2010, 2012),
#' size = factor(
#' c("XS", "S", "M", "S", "S", "M"),
#' levels = c("XS", "S", "M", "L")
#' ),
#' weights = rnorm(6, as.numeric(size) + 2)
#' ))
#'
#' # All possible combinations ---------------------------------------
#' # Note that all defined, but not necessarily present, levels of the
#' # factor variable `size` are retained.
#' fruits %>% expand(type)
#' fruits %>% expand(type, size)
#' fruits %>% expand(type, size, year)
#'
#' # Other uses -------------------------------------------------------
#' fruits %>% expand(type, size, 2010:2012)
#'
#' # Use `anti_join()` to determine which observations are missing
#' all <- fruits %>% expand(type, size, year)
#' all
#' all %>% dplyr::anti_join(fruits)
#'
#' # Use with `right_join()` to fill in missing rows
#' fruits %>% dplyr::right_join(all)
# exported onLoad
expand.dtplyr_step <- function(data, ..., .name_repair = "check_unique") {
dots <- capture_dots(data, ..., .j = FALSE)
dots <- dots[!vapply(dots, is_null, logical(1))]
if (length(dots) == 0) {
return(data)
}

named_dots <- have_name(dots)
if (any(!named_dots)) {
# Auto-names generated by enquos() don't always work with the CJ() step
## Ex: `1:3`
# Replicates the "V" naming convention data.table uses
symbol_dots <- vapply(dots, is_symbol, logical(1))
needs_v_name <- !symbol_dots & !named_dots
v_names <- paste0("V", 1:length(dots))
names(dots)[needs_v_name] <- v_names[needs_v_name]
names(dots)[symbol_dots] <- lapply(dots[symbol_dots], as_name)
}
names(dots) <- vctrs::vec_as_names(names(dots), repair = .name_repair)

on <- names(dots)
cj <- expr(CJ(!!!syms(on), unique = TRUE))

out <- distinct(data, !!!syms(data$groups), !!!dots)
if (length(data$groups) == 0) {
out <- step_subset(out, i = cj, on = on)
} else {
on <- call2(".", !!!syms(on))
out <- step_subset(out, j = expr(.SD[!!cj, on = !!on]))
}

out
}

# exported onLoad
expand.data.table <- function(data, ..., .name_repair = "check_unique") {
data <- lazy_dt(data)
tidyr::expand(data, ..., .name_repair = .name_repair)
}
4 changes: 2 additions & 2 deletions R/step-subset-slice.R
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,8 @@ slice_min_max <- function(.data, order_by, decreasing, ..., n, prop, with_ties =
prop = expr(!!smaller_ranks(!!order_by, !!size$prop * .N, ties.method = ties.method))
)

step_subset_i(.data, i) %>%
arrange(!!order_by, .by_group = TRUE)
out <- step_subset_i(.data, i)
arrange(out, !!order_by, .by_group = TRUE)
}

smaller_ranks <- function(x, y, ties.method = "min") {
Expand Down
8 changes: 5 additions & 3 deletions R/step-subset.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@ step_subset <- function(parent,
arrange = parent$arrange,
i = NULL,
j = NULL,
on = character()
) {
on = character(),
allow_cartesian = NULL
) {

stopifnot(is_step(parent))
stopifnot(is.null(i) || is_expression(i) || is_step(i))
Expand All @@ -22,6 +23,7 @@ step_subset <- function(parent,
i = i,
j = j,
on = on,
allow_cartesian = allow_cartesian,
implicit_copy = !is.null(i) || !is.null(j),
class = "dtplyr_step_subset"
)
Expand Down Expand Up @@ -117,7 +119,7 @@ dt_call.dtplyr_step_subset <- function(x, needs_copy = x$needs_copy) {

if (length(x$on) > 0) {
out$on <- call2(".", !!!syms(x$on))
out$allow.cartesian <- TRUE
out$allow.cartesian <- x$allow_cartesian
}
out
}
2 changes: 1 addition & 1 deletion R/tidyeval.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ dt_eval <- function(x) {
# Make sure data.table functions are available so dtplyr still works
# even when data.table isn't attached
dt_funs <- c(
"copy", "dcast", "melt", "nafill",
"CJ", "copy", "dcast", "melt", "nafill",
"fcase", "fcoalesce", "fifelse", "fintersect", "frank", "frankv", "fsetdiff", "funion",
"setcolorder", "setnames"
)
Expand Down
4 changes: 4 additions & 0 deletions R/zzz.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
register_s3_method("dplyr", "intersect", "data.table")
register_s3_method("dplyr", "setdiff", "data.table")
register_s3_method("dplyr", "union", "data.table")
register_s3_method("tidyr", "complete", "data.table")
register_s3_method("tidyr", "drop_na", "data.table")
register_s3_method("tidyr", "expand", "data.table")
register_s3_method("tidyr", "fill", "data.table")
register_s3_method("tidyr", "pivot_longer", "data.table")
register_s3_method("tidyr", "pivot_wider", "data.table")
Expand All @@ -14,7 +16,9 @@
register_s3_method("dplyr", "intersect", "dtplyr_step")
register_s3_method("dplyr", "setdiff", "dtplyr_step")
register_s3_method("dplyr", "union", "dtplyr_step")
register_s3_method("tidyr", "complete", "dtplyr_step")
register_s3_method("tidyr", "drop_na", "dtplyr_step")
register_s3_method("tidyr", "expand", "dtplyr_step")
register_s3_method("tidyr", "fill", "dtplyr_step")
register_s3_method("tidyr", "pivot_longer", "dtplyr_step")
register_s3_method("tidyr", "pivot_wider", "dtplyr_step")
Expand Down
52 changes: 52 additions & 0 deletions man/complete.dtplyr_step.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

84 changes: 84 additions & 0 deletions man/expand.dtplyr_step.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

25 changes: 25 additions & 0 deletions tests/testthat/test-complete.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
test_that("complete with no variables returns data as is", {
mtcars_dt <- lazy_dt(mtcars, "DT")
expect_equal(complete(mtcars_dt), mtcars_dt)
})

test_that("basic invocation works", {
tbl <- tibble(x = 1:2, y = 1:2, z = 3:4)
dt <- lazy_dt(tbl, "DT")
out <- dt %>% complete(x, y) %>% collect()

expect_equal(nrow(out), 4)
expect_equal(out$z, c(3, NA, NA, 4))
})

test_that("empty expansion returns original", {
tbl <- tibble(x = character())
dt <- lazy_dt(tbl, "DT")
out <- dt %>% complete(y = NULL) %>% collect()
expect_equal(out, tbl)

tbl <- tibble(x = 1:4)
dt <- lazy_dt(tbl, "DT")
out <- dt %>% complete(y = NULL) %>% collect()
expect_equal(out, tbl)
})
Loading

0 comments on commit 3360f20

Please sign in to comment.