tidyverse · hadley · Mar 19, 2021 · Mar 11, 2021 · Mar 11, 2021 · Mar 11, 2021
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -19,7 +19,7 @@ Depends:
     R (>= 3.3)
 Imports:
     crayon,
-    data.table (>= 1.12.4),
+    data.table (>= 1.13.0),
     dplyr (>= 1.0.3),
     ellipsis,
     glue,

diff --git a/NEWS.md b/NEWS.md
@@ -22,10 +22,12 @@
 
 * `.data` and `.env` pronouns now work inside of `if_else()` (@markfairbanks, #220).
 
-* More tidyr verbs are in the process of being added:
-
+* More translations for tidyr verbs have been added:
+  
   * `drop_na()` (@markfairbanks, #194)
 
+  * `expand()` (@markfairbanks, #225)
+
   * `fill()` (@markfairbanks, #197)
 
   * `pivot_longer()` (@markfairbanks, #204)

diff --git a/R/step-subset-expand.R b/R/step-subset-expand.R
@@ -0,0 +1,80 @@
+#' Expand data frame to include all possible combinations of values.
+#'
+#' @description
+#' This is a method for the tidyr `expand()` generic. It is translated to
+#' [data.table::CJ()].
+#'
+#' @param data A [lazy_dt()].
+#' @inheritParams tidyr::expand
+#' @examples
+#' library(tidyr)
+#'
+#' fruits <- lazy_dt(tibble(
+#'   type   = c("apple", "orange", "apple", "orange", "orange", "orange"),
+#'   year   = c(2010, 2010, 2012, 2010, 2010, 2012),
+#'   size  =  factor(
+#'     c("XS", "S",  "M", "S", "S", "M"),
+#'     levels = c("XS", "S", "M", "L")
+#'   ),
+#'   weights = rnorm(6, as.numeric(size) + 2)
+#' ))
+#'
+#' # All possible combinations ---------------------------------------
+#' # Note that all defined, but not necessarily present, levels of the
+#' # factor variable `size` are retained.
+#' fruits %>% expand(type)
+#' fruits %>% expand(type, size)
+#' fruits %>% expand(type, size, year)
+#'
+#' # Other uses -------------------------------------------------------
+#' fruits %>% expand(type, size, 2010:2012)
+#'
+#' # Use `anti_join()` to determine which observations are missing
+#' all <- fruits %>% expand(type, size, year)
+#' all
+#' all %>% dplyr::anti_join(fruits)
+#'
+#' # Use with `right_join()` to fill in missing rows
+#' fruits %>% dplyr::right_join(all)
+# exported onLoad
+expand.dtplyr_step <- function(data, ..., .name_repair = "check_unique") {
+  dots <- capture_dots(data, ..., .j = FALSE)
+  dots <- dots[!vapply(dots, is_null, logical(1))]
+  if (length(dots) == 0) {
+    return(data)
+  }
+
+  named_dots <- have_name(dots)
+  if (any(!named_dots)) {
+    # Auto-names generated by enquos() don't always work with the CJ() step
+      ## Ex: `1:3`
+    # Replicates the "V" naming convention data.table uses
+    symbol_dots <- vapply(dots, is_symbol, logical(1))
+    needs_v_name <- !symbol_dots & !named_dots
+    v_names <- paste0("V", 1:length(dots))
+    names(dots)[needs_v_name] <- v_names[needs_v_name]
+    names(dots)[symbol_dots] <- lapply(dots[symbol_dots], as_name)
+  }
+  names(dots) <- vctrs::vec_as_names(names(dots), repair = .name_repair)
+
+  on <- names(dots)
+
+  out <- distinct(data, !!!syms(data$groups), !!!dots)
+
+  cj <- expr(CJ(!!!syms(on), unique = TRUE))
+
+  if (length(data$groups) == 0) {
+    out <- step_subset(out, i = cj, on = on, allow_cartesian = FALSE)
+  } else {
+    on <- call2(".", !!!syms(on))
+    out <- step_subset(out, j = expr(.SD[!!cj, on = !!on]), allow_cartesian = FALSE)
+  }
+
+  out
+}
+
+# exported onLoad
+expand.data.table <- function(data, ..., .name_repair = "check_unique") {
+  data <- lazy_dt(data)
+  tidyr::expand(data, ..., .name_repair = .name_repair)
+}
diff --git a/R/step-subset-slice.R b/R/step-subset-slice.R
@@ -157,8 +157,9 @@ slice_min_max <- function(.data, order_by, decreasing, ..., n, prop, with_ties =
     prop = expr(!!smaller_ranks(!!order_by, !!size$prop * .N, ties.method = ties.method))
   )
 
-  step_subset_i(.data, i) %>%
-    arrange(!!order_by, .by_group = TRUE)
+  out <- step_subset_i(.data, i)
+
+  arrange(out, !!order_by, .by_group = TRUE)
 }
 
 smaller_ranks <- function(x, y, ties.method = "min") {

diff --git a/R/step-subset.R b/R/step-subset.R
@@ -5,8 +5,9 @@ step_subset <- function(parent,
                         arrange = parent$arrange,
                         i = NULL,
                         j = NULL,
-                        on = character()
-                        ) {
+                        on = character(),
+                        allow_cartesian = NULL
+) {
 
   stopifnot(is_step(parent))
   stopifnot(is.null(i) || is_expression(i) || is_step(i))
@@ -22,6 +23,7 @@ step_subset <- function(parent,
     i = i,
     j = j,
     on = on,
+    allow_cartesian = allow_cartesian,
     implicit_copy = !is.null(i) || !is.null(j),
     class = "dtplyr_step_subset"
   )
@@ -117,7 +119,11 @@ dt_call.dtplyr_step_subset <- function(x, needs_copy = x$needs_copy) {
 
   if (length(x$on) > 0) {
     out$on <- call2(".", !!!syms(x$on))
-    out$allow.cartesian <- TRUE
+    if (is.null(x$allow_cartesian)) {
+      out$allow.cartesian <- TRUE
+    } else {
+      out$allow.cartesian <- parent$allow_cartesian
+    }
-    if (is.null(x$allow_cartesian)) {
-      out$allow.cartesian <- TRUE
-    } else {
-      out$allow.cartesian <- parent$allow_cartesian
-    }
+    out$allow.cartesian <- parent$allow_cartesian %||% TRUE
-    if (is.null(x$allow_cartesian)) {
-      out$allow.cartesian <- TRUE
-    } else {
-      out$allow.cartesian <- parent$allow_cartesian
-    }
+    out$allow.cartesian <- parent$allow_cartesian %||% TRUE
   }
   out
 }
diff --git a/R/tidyeval.R b/R/tidyeval.R
@@ -14,7 +14,7 @@ dt_eval <- function(x) {
 # Make sure data.table functions are available so dtplyr still works
 # even when data.table isn't attached
 dt_funs <- c(
-  "copy", "dcast", "melt", "nafill",
+  "CJ", "copy", "dcast", "melt", "nafill",
   "fcase", "fcoalesce", "fifelse", "fintersect", "frank", "frankv", "fsetdiff", "funion",
   "setcolorder", "setnames"
 )

diff --git a/R/zzz.R b/R/zzz.R
@@ -5,6 +5,7 @@
   register_s3_method("dplyr", "setdiff", "data.table")
   register_s3_method("dplyr", "union", "data.table")
   register_s3_method("tidyr", "drop_na", "data.table")
+  register_s3_method("tidyr", "expand", "data.table")
   register_s3_method("tidyr", "fill", "data.table")
   register_s3_method("tidyr", "pivot_longer", "data.table")
   register_s3_method("tidyr", "pivot_wider", "data.table")
@@ -15,6 +16,7 @@
   register_s3_method("dplyr", "setdiff", "dtplyr_step")
   register_s3_method("dplyr", "union", "dtplyr_step")
   register_s3_method("tidyr", "drop_na", "dtplyr_step")
+  register_s3_method("tidyr", "expand", "dtplyr_step")
   register_s3_method("tidyr", "fill", "dtplyr_step")
   register_s3_method("tidyr", "pivot_longer", "dtplyr_step")
   register_s3_method("tidyr", "pivot_wider", "dtplyr_step")

diff --git a/man/expand.dtplyr_step.Rd b/man/expand.dtplyr_step.Rd
diff --git a/tests/testthat/test-step-subset-expand.R b/tests/testthat/test-step-subset-expand.R
@@ -0,0 +1,85 @@
+test_that("expand completes all values", {
+  tbl <- tibble(x = 1:2, y = 1:2)
+  dt <- lazy_dt(tbl, "DT")
+  step <- expand(dt, x, y)
+  out <- collect(step)
+
+  expect_equal(
+    show_query(step),
+    expr(unique(DT[, .(x, y)])[CJ(x, y, unique = TRUE), on = .(x, y)])
+  )
+  expect_equal(step$vars, c("x", "y"))
+  expect_equal(nrow(out), 4)
+})
+
+test_that("multiple variables in one arg doesn't expand", {
+  tbl <- tibble(x = 1:2, y = 1:2)
+  dt <- lazy_dt(tbl, "DT")
+  step <- expand(dt, c(x, y))
+  out <- collect(step)
+
+  expect_equal(nrow(out), 2)
+})
+
+test_that("works with unnamed vectors", {
+  tbl <- tibble(x = 1:2, y = 1:2)
+  dt <- lazy_dt(tbl, "DT")
+  step <- expand(dt, x, 1:2)
+  out <- collect(step)
+
+  expect_equal(
+    show_query(step),
+    expr(unique(DT[, .(x = x, V2 = 1:2)])[CJ(x, V2, unique = TRUE), on = .(x, V2)])
+  )
+  expect_equal(step$vars, c("x", "V2"))
+  expect_equal(nrow(out), 4)
+})
+
+test_that("works with named vectors", {
+  tbl <- tibble(x = 1:2, y = 1:2)
+  dt <- lazy_dt(tbl, "DT")
+  step <- expand(dt, x, val = 1:2)
+  out <- collect(step)
+
+  expect_equal(
+    show_query(step),
+    expr(unique(DT[, .(x = x, val = 1:2)])[CJ(x, val, unique = TRUE), on = .(x, val)])
+  )
+  expect_equal(step$vars, c("x", "val"))
+  expect_equal(nrow(out), 4)
+})
+
+test_that("expand respects groups", {
+  tbl <- tibble(
+    a = c(1L, 1L, 2L),
+    b = c(1L, 2L, 1L),
+    c = c(2L, 1L, 1L)
+  )
+  dt <- lazy_dt(tbl, "DT")
+  step <- dt %>% group_by(c) %>% expand(a, b)
+  out <- collect(step)
+
+  expect_equal(
+    show_query(step),
+    expr(unique(DT[, .(c, a, b)])[, .SD[CJ(a, b, unique = TRUE), on = .(a, b)], keyby = .(c)])
+  )
+  expect_equal(step$vars, c("c", "a", "b"))
+  expect_equal(out$a, c(1, 1, 2, 2, 1))
+  expect_equal(out$b, c(1, 2, 1, 2, 1))
+})
+
+test_that("NULL inputs", {
+  tbl <- tibble(x = 1:5)
+  dt <- lazy_dt(tbl, "DT")
+  step <- expand(dt, x, y = NULL)
+  out <- collect(step)
+  expect_equal(out, tbl)
+})
+
+test_that("expand respects .name_repair", {
+  dt <- lazy_dt(tibble(x = 1:2), "DT")
+
+  suppressMessages(
+    expect_named(dt %>% expand(x, x, .name_repair = "unique") %>% collect(), c("x...1", "x...2"))
+  )
+})