From 3360f20738adde68195ca59f3c824bdeaed32a52 Mon Sep 17 00:00:00 2001
From: Mark Fairbanks <mark.t.fairbanks@gmail.com>
Date: Fri, 19 Mar 2021 15:29:10 -0700
Subject: [PATCH] Implement expand and complete (#225)

Also bump minimum data.table version to 1.13.0
---
 DESCRIPTION                              |  2 +-
 NEWS.md                                  |  8 ++-
 R/complete.R                             | 38 +++++++++++
 R/step-join.R                            |  3 +-
 R/step-subset-expand.R                   | 78 ++++++++++++++++++++++
 R/step-subset-slice.R                    |  4 +-
 R/step-subset.R                          |  8 ++-
 R/tidyeval.R                             |  2 +-
 R/zzz.R                                  |  4 ++
 man/complete.dtplyr_step.Rd              | 52 +++++++++++++++
 man/expand.dtplyr_step.Rd                | 84 +++++++++++++++++++++++
 tests/testthat/test-complete.R           | 25 +++++++
 tests/testthat/test-step-subset-expand.R | 85 ++++++++++++++++++++++++
 13 files changed, 383 insertions(+), 10 deletions(-)
 create mode 100644 R/complete.R
 create mode 100644 R/step-subset-expand.R
 create mode 100644 man/complete.dtplyr_step.Rd
 create mode 100644 man/expand.dtplyr_step.Rd
 create mode 100644 tests/testthat/test-complete.R
 create mode 100644 tests/testthat/test-step-subset-expand.R

diff --git a/DESCRIPTION b/DESCRIPTION
index 738aa6076..94439d37b 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -19,7 +19,7 @@ Depends:
     R (>= 3.3)
 Imports:
     crayon,
-    data.table (>= 1.12.4),
+    data.table (>= 1.13.0),
     dplyr (>= 1.0.3),
     ellipsis,
     glue,
diff --git a/NEWS.md b/NEWS.md
index 3e66b9f14..e9fddd732 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -22,10 +22,14 @@
 
 * `.data` and `.env` pronouns now work inside of `if_else()` (@markfairbanks, #220).
 
-* More tidyr verbs are in the process of being added:
-
+* More translations for tidyr verbs have been added:
+  
   * `drop_na()` (@markfairbanks, #194)
   
+  * `complete()` (@markfairbanks, #225)
+  
+  * `expand()` (@markfairbanks, #225)
+  
   * `fill()` (@markfairbanks, #197)
   
   * `pivot_longer()` (@markfairbanks, #204)
diff --git a/R/complete.R b/R/complete.R
new file mode 100644
index 000000000..668b9b7d3
--- /dev/null
+++ b/R/complete.R
@@ -0,0 +1,38 @@
+#' Complete a data frame with missing combinations of data
+#'
+#' @description
+#' This is a method for the tidyr `complete()` generic. This is a wrapper
+#' around `dtplyr` translations for `expand()`, `full_join()`, and `replace_na()`
+#' that's useful for completing missing combinations of data.
+#'
+#' @param data A [lazy_dt()].
+#' @inheritParams tidyr::complete
+#' @examples
+#' library(tidyr)
+#' tbl <- tibble(x = 1:2, y = 1:2, z = 3:4)
+#' dt <- lazy_dt(tbl)
+#'
+#' dt %>%
+#'   complete(x, y)
+#'
+#' dt %>%
+#'   complete(x, y, fill = list(z = 10L))
+# exported onLoad
+complete.dtplyr_step <- function(data, ..., fill = list()) {
+  dots <- enquos(...)
+  dots <- dots[!vapply(dots, quo_is_null, logical(1))]
+  if (length(dots) == 0) {
+    return(data)
+  }
+
+  full <- tidyr::expand(data, !!!dots)
+  full <- dplyr::full_join(full, data, by = full$vars)
+  full <- tidyr::replace_na(full, replace = fill)
+  full
+}
+
+# exported onLoad
+complete.data.table <- function(data, ..., fill = list()) {
+  data <- lazy_dt(data)
+  tidyr::complete(data, ..., fill = fill)
+}
diff --git a/R/step-join.R b/R/step-join.R
index f8d0379da..9da8944ac 100644
--- a/R/step-join.R
+++ b/R/step-join.R
@@ -128,7 +128,8 @@ step_subset_on <- function(x, y, i, on) {
     vars = union(x$vars, y$vars),
     i = y,
     on = on,
-    locals = utils::modifyList(x$locals, y$locals)
+    locals = utils::modifyList(x$locals, y$locals),
+    allow_cartesian = TRUE
   )
 }
 
diff --git a/R/step-subset-expand.R b/R/step-subset-expand.R
new file mode 100644
index 000000000..fefa753fa
--- /dev/null
+++ b/R/step-subset-expand.R
@@ -0,0 +1,78 @@
+#' Expand data frame to include all possible combinations of values.
+#'
+#' @description
+#' This is a method for the tidyr `expand()` generic. It is translated to
+#' [data.table::CJ()].
+#'
+#' @param data A [lazy_dt()].
+#' @inheritParams tidyr::expand
+#' @examples
+#' library(tidyr)
+#'
+#' fruits <- lazy_dt(tibble(
+#'   type   = c("apple", "orange", "apple", "orange", "orange", "orange"),
+#'   year   = c(2010, 2010, 2012, 2010, 2010, 2012),
+#'   size  =  factor(
+#'     c("XS", "S",  "M", "S", "S", "M"),
+#'     levels = c("XS", "S", "M", "L")
+#'   ),
+#'   weights = rnorm(6, as.numeric(size) + 2)
+#' ))
+#'
+#' # All possible combinations ---------------------------------------
+#' # Note that all defined, but not necessarily present, levels of the
+#' # factor variable `size` are retained.
+#' fruits %>% expand(type)
+#' fruits %>% expand(type, size)
+#' fruits %>% expand(type, size, year)
+#'
+#' # Other uses -------------------------------------------------------
+#' fruits %>% expand(type, size, 2010:2012)
+#'
+#' # Use `anti_join()` to determine which observations are missing
+#' all <- fruits %>% expand(type, size, year)
+#' all
+#' all %>% dplyr::anti_join(fruits)
+#'
+#' # Use with `right_join()` to fill in missing rows
+#' fruits %>% dplyr::right_join(all)
+# exported onLoad
+expand.dtplyr_step <- function(data, ..., .name_repair = "check_unique") {
+  dots <- capture_dots(data, ..., .j = FALSE)
+  dots <- dots[!vapply(dots, is_null, logical(1))]
+  if (length(dots) == 0) {
+    return(data)
+  }
+
+  named_dots <- have_name(dots)
+  if (any(!named_dots)) {
+    # Auto-names generated by enquos() don't always work with the CJ() step
+      ## Ex: `1:3`
+    # Replicates the "V" naming convention data.table uses
+    symbol_dots <- vapply(dots, is_symbol, logical(1))
+    needs_v_name <- !symbol_dots & !named_dots
+    v_names <- paste0("V", 1:length(dots))
+    names(dots)[needs_v_name] <- v_names[needs_v_name]
+    names(dots)[symbol_dots] <- lapply(dots[symbol_dots], as_name)
+  }
+  names(dots) <- vctrs::vec_as_names(names(dots), repair = .name_repair)
+
+  on <- names(dots)
+  cj <- expr(CJ(!!!syms(on), unique = TRUE))
+
+  out <- distinct(data, !!!syms(data$groups), !!!dots)
+  if (length(data$groups) == 0) {
+    out <- step_subset(out, i = cj, on = on)
+  } else {
+    on <- call2(".", !!!syms(on))
+    out <- step_subset(out, j = expr(.SD[!!cj, on = !!on]))
+  }
+
+  out
+}
+
+# exported onLoad
+expand.data.table <- function(data, ..., .name_repair = "check_unique") {
+  data <- lazy_dt(data)
+  tidyr::expand(data, ..., .name_repair = .name_repair)
+}
diff --git a/R/step-subset-slice.R b/R/step-subset-slice.R
index a5b8cae20..35614c8b1 100644
--- a/R/step-subset-slice.R
+++ b/R/step-subset-slice.R
@@ -157,8 +157,8 @@ slice_min_max <- function(.data, order_by, decreasing, ..., n, prop, with_ties =
     prop = expr(!!smaller_ranks(!!order_by, !!size$prop * .N, ties.method = ties.method))
   )
 
-  step_subset_i(.data, i) %>%
-    arrange(!!order_by, .by_group = TRUE)
+  out <- step_subset_i(.data, i)
+  arrange(out, !!order_by, .by_group = TRUE)
 }
 
 smaller_ranks <- function(x, y, ties.method = "min") {
diff --git a/R/step-subset.R b/R/step-subset.R
index 86bbefcb9..00ee8634d 100644
--- a/R/step-subset.R
+++ b/R/step-subset.R
@@ -5,8 +5,9 @@ step_subset <- function(parent,
                         arrange = parent$arrange,
                         i = NULL,
                         j = NULL,
-                        on = character()
-                        ) {
+                        on = character(),
+                        allow_cartesian = NULL
+) {
 
   stopifnot(is_step(parent))
   stopifnot(is.null(i) || is_expression(i) || is_step(i))
@@ -22,6 +23,7 @@ step_subset <- function(parent,
     i = i,
     j = j,
     on = on,
+    allow_cartesian = allow_cartesian,
     implicit_copy = !is.null(i) || !is.null(j),
     class = "dtplyr_step_subset"
   )
@@ -117,7 +119,7 @@ dt_call.dtplyr_step_subset <- function(x, needs_copy = x$needs_copy) {
 
   if (length(x$on) > 0) {
     out$on <- call2(".", !!!syms(x$on))
-    out$allow.cartesian <- TRUE
+    out$allow.cartesian <- x$allow_cartesian
   }
   out
 }
diff --git a/R/tidyeval.R b/R/tidyeval.R
index b97f29545..704064762 100644
--- a/R/tidyeval.R
+++ b/R/tidyeval.R
@@ -14,7 +14,7 @@ dt_eval <- function(x) {
 # Make sure data.table functions are available so dtplyr still works
 # even when data.table isn't attached
 dt_funs <- c(
-  "copy", "dcast", "melt", "nafill",
+  "CJ", "copy", "dcast", "melt", "nafill",
   "fcase", "fcoalesce", "fifelse", "fintersect", "frank", "frankv", "fsetdiff", "funion",
   "setcolorder", "setnames"
 )
diff --git a/R/zzz.R b/R/zzz.R
index 0537891b0..82c810ca8 100644
--- a/R/zzz.R
+++ b/R/zzz.R
@@ -4,7 +4,9 @@
   register_s3_method("dplyr", "intersect", "data.table")
   register_s3_method("dplyr", "setdiff", "data.table")
   register_s3_method("dplyr", "union", "data.table")
+  register_s3_method("tidyr", "complete", "data.table")
   register_s3_method("tidyr", "drop_na", "data.table")
+  register_s3_method("tidyr", "expand", "data.table")
   register_s3_method("tidyr", "fill", "data.table")
   register_s3_method("tidyr", "pivot_longer", "data.table")
   register_s3_method("tidyr", "pivot_wider", "data.table")
@@ -14,7 +16,9 @@
   register_s3_method("dplyr", "intersect", "dtplyr_step")
   register_s3_method("dplyr", "setdiff", "dtplyr_step")
   register_s3_method("dplyr", "union", "dtplyr_step")
+  register_s3_method("tidyr", "complete", "dtplyr_step")
   register_s3_method("tidyr", "drop_na", "dtplyr_step")
+  register_s3_method("tidyr", "expand", "dtplyr_step")
   register_s3_method("tidyr", "fill", "dtplyr_step")
   register_s3_method("tidyr", "pivot_longer", "dtplyr_step")
   register_s3_method("tidyr", "pivot_wider", "dtplyr_step")
diff --git a/man/complete.dtplyr_step.Rd b/man/complete.dtplyr_step.Rd
new file mode 100644
index 000000000..d53b93f6a
--- /dev/null
+++ b/man/complete.dtplyr_step.Rd
@@ -0,0 +1,52 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/complete.R
+\name{complete.dtplyr_step}
+\alias{complete.dtplyr_step}
+\title{Complete a data frame with missing combinations of data}
+\usage{
+\method{complete}{dtplyr_step}(data, ..., fill = list())
+}
+\arguments{
+\item{data}{A \code{\link[=lazy_dt]{lazy_dt()}}.}
+
+\item{...}{Specification of columns to expand. Columns can be atomic vectors
+or lists.
+\itemize{
+\item To find all unique combinations of \code{x}, \code{y} and \code{z}, including those not
+present in the data, supply each variable as a separate argument:
+\code{expand(df, x, y, z)}.
+\item To find only the combinations that occur in the
+data, use \code{nesting}: \code{expand(df, nesting(x, y, z))}.
+\item You can combine the two forms. For example,
+\code{expand(df, nesting(school_id, student_id), date)} would produce
+a row for each present school-student combination for all possible
+dates.
+}
+
+When used with factors, \code{expand()} uses the full set of levels, not just
+those that appear in the data. If you want to use only the values seen in
+the data, use \code{forcats::fct_drop()}.
+
+When used with continuous variables, you may need to fill in values
+that do not appear in the data: to do so use expressions like
+\code{year = 2010:2020} or \code{year = full_seq(year,1)}.}
+
+\item{fill}{A named list that for each variable supplies a single value to
+use instead of \code{NA} for missing combinations.}
+}
+\description{
+This is a method for the tidyr \code{complete()} generic. This is a wrapper
+around \code{dtplyr} translations for \code{expand()}, \code{full_join()}, and \code{replace_na()}
+that's useful for completing missing combinations of data.
+}
+\examples{
+library(tidyr)
+tbl <- tibble(x = 1:2, y = 1:2, z = 3:4)
+dt <- lazy_dt(tbl)
+
+dt \%>\%
+  complete(x, y)
+
+dt \%>\%
+  complete(x, y, fill = list(z = 10L))
+}
diff --git a/man/expand.dtplyr_step.Rd b/man/expand.dtplyr_step.Rd
new file mode 100644
index 000000000..31f4698c7
--- /dev/null
+++ b/man/expand.dtplyr_step.Rd
@@ -0,0 +1,84 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/step-subset-expand.R
+\name{expand.dtplyr_step}
+\alias{expand.dtplyr_step}
+\title{Expand data frame to include all possible combinations of values.}
+\usage{
+\method{expand}{dtplyr_step}(data, ..., .name_repair = "check_unique")
+}
+\arguments{
+\item{data}{A \code{\link[=lazy_dt]{lazy_dt()}}.}
+
+\item{...}{Specification of columns to expand. Columns can be atomic vectors
+or lists.
+\itemize{
+\item To find all unique combinations of \code{x}, \code{y} and \code{z}, including those not
+present in the data, supply each variable as a separate argument:
+\code{expand(df, x, y, z)}.
+\item To find only the combinations that occur in the
+data, use \code{nesting}: \code{expand(df, nesting(x, y, z))}.
+\item You can combine the two forms. For example,
+\code{expand(df, nesting(school_id, student_id), date)} would produce
+a row for each present school-student combination for all possible
+dates.
+}
+
+When used with factors, \code{expand()} uses the full set of levels, not just
+those that appear in the data. If you want to use only the values seen in
+the data, use \code{forcats::fct_drop()}.
+
+When used with continuous variables, you may need to fill in values
+that do not appear in the data: to do so use expressions like
+\code{year = 2010:2020} or \code{year = full_seq(year,1)}.}
+
+\item{.name_repair}{Treatment of problematic column names:
+\itemize{
+\item \code{"minimal"}: No name repair or checks, beyond basic existence,
+\item \code{"unique"}: Make sure names are unique and not empty,
+\item \code{"check_unique"}: (default value), no name repair, but check they are
+\code{unique},
+\item \code{"universal"}: Make the names \code{unique} and syntactic
+\item a function: apply custom name repair (e.g., \code{.name_repair = make.names}
+for names in the style of base R).
+\item A purrr-style anonymous function, see \code{\link[rlang:as_function]{rlang::as_function()}}
+}
+
+This argument is passed on as \code{repair} to \code{\link[vctrs:vec_as_names]{vctrs::vec_as_names()}}.
+See there for more details on these terms and the strategies used
+to enforce them.}
+}
+\description{
+This is a method for the tidyr \code{expand()} generic. It is translated to
+\code{\link[data.table:J]{data.table::CJ()}}.
+}
+\examples{
+library(tidyr)
+
+fruits <- lazy_dt(tibble(
+  type   = c("apple", "orange", "apple", "orange", "orange", "orange"),
+  year   = c(2010, 2010, 2012, 2010, 2010, 2012),
+  size  =  factor(
+    c("XS", "S",  "M", "S", "S", "M"),
+    levels = c("XS", "S", "M", "L")
+  ),
+  weights = rnorm(6, as.numeric(size) + 2)
+))
+
+# All possible combinations ---------------------------------------
+# Note that all defined, but not necessarily present, levels of the
+# factor variable `size` are retained.
+fruits \%>\% expand(type)
+fruits \%>\% expand(type, size)
+fruits \%>\% expand(type, size, year)
+
+# Other uses -------------------------------------------------------
+fruits \%>\% expand(type, size, 2010:2012)
+
+# Use `anti_join()` to determine which observations are missing
+all <- fruits \%>\% expand(type, size, year)
+all
+all \%>\% dplyr::anti_join(fruits)
+
+# Use with `right_join()` to fill in missing rows
+fruits \%>\% dplyr::right_join(all)
+}
diff --git a/tests/testthat/test-complete.R b/tests/testthat/test-complete.R
new file mode 100644
index 000000000..1e4718e80
--- /dev/null
+++ b/tests/testthat/test-complete.R
@@ -0,0 +1,25 @@
+test_that("complete with no variables returns data as is", {
+  mtcars_dt <- lazy_dt(mtcars, "DT")
+  expect_equal(complete(mtcars_dt), mtcars_dt)
+})
+
+test_that("basic invocation works", {
+  tbl <- tibble(x = 1:2, y = 1:2, z = 3:4)
+  dt <- lazy_dt(tbl, "DT")
+  out <- dt %>% complete(x, y) %>% collect()
+
+  expect_equal(nrow(out), 4)
+  expect_equal(out$z, c(3, NA, NA, 4))
+})
+
+test_that("empty expansion returns original", {
+  tbl <- tibble(x = character())
+  dt <- lazy_dt(tbl, "DT")
+  out <- dt %>% complete(y = NULL) %>% collect()
+  expect_equal(out, tbl)
+
+  tbl <- tibble(x = 1:4)
+  dt <- lazy_dt(tbl, "DT")
+  out <- dt %>% complete(y = NULL) %>% collect()
+  expect_equal(out, tbl)
+})
diff --git a/tests/testthat/test-step-subset-expand.R b/tests/testthat/test-step-subset-expand.R
new file mode 100644
index 000000000..69086da5c
--- /dev/null
+++ b/tests/testthat/test-step-subset-expand.R
@@ -0,0 +1,85 @@
+test_that("expand completes all values", {
+  tbl <- tibble(x = 1:2, y = 1:2)
+  dt <- lazy_dt(tbl, "DT")
+  step <- expand(dt, x, y)
+  out <- collect(step)
+
+  expect_equal(
+    show_query(step),
+    expr(unique(DT[, .(x, y)])[CJ(x, y, unique = TRUE), on = .(x, y)])
+  )
+  expect_equal(step$vars, c("x", "y"))
+  expect_equal(nrow(out), 4)
+})
+
+test_that("multiple variables in one arg doesn't expand", {
+  tbl <- tibble(x = 1:2, y = 1:2)
+  dt <- lazy_dt(tbl, "DT")
+  step <- expand(dt, c(x, y))
+  out <- collect(step)
+
+  expect_equal(nrow(out), 2)
+})
+
+test_that("works with unnamed vectors", {
+  tbl <- tibble(x = 1:2, y = 1:2)
+  dt <- lazy_dt(tbl, "DT")
+  step <- expand(dt, x, 1:2)
+  out <- collect(step)
+
+  expect_equal(
+    show_query(step),
+    expr(unique(DT[, .(x = x, V2 = 1:2)])[CJ(x, V2, unique = TRUE), on = .(x, V2)])
+  )
+  expect_equal(step$vars, c("x", "V2"))
+  expect_equal(nrow(out), 4)
+})
+
+test_that("works with named vectors", {
+  tbl <- tibble(x = 1:2, y = 1:2)
+  dt <- lazy_dt(tbl, "DT")
+  step <- expand(dt, x, val = 1:2)
+  out <- collect(step)
+
+  expect_equal(
+    show_query(step),
+    expr(unique(DT[, .(x = x, val = 1:2)])[CJ(x, val, unique = TRUE), on = .(x, val)])
+  )
+  expect_equal(step$vars, c("x", "val"))
+  expect_equal(nrow(out), 4)
+})
+
+test_that("expand respects groups", {
+  tbl <- tibble(
+    a = c(1L, 1L, 2L),
+    b = c(1L, 2L, 1L),
+    c = c(2L, 1L, 1L)
+  )
+  dt <- lazy_dt(tbl, "DT")
+  step <- dt %>% group_by(c) %>% expand(a, b)
+  out <- collect(step)
+
+  expect_equal(
+    show_query(step),
+    expr(unique(DT[, .(c, a, b)])[, .SD[CJ(a, b, unique = TRUE), on = .(a, b)], keyby = .(c)])
+  )
+  expect_equal(step$vars, c("c", "a", "b"))
+  expect_equal(out$a, c(1, 1, 2, 2, 1))
+  expect_equal(out$b, c(1, 2, 1, 2, 1))
+})
+
+test_that("NULL inputs", {
+  tbl <- tibble(x = 1:5)
+  dt <- lazy_dt(tbl, "DT")
+  step <- expand(dt, x, y = NULL)
+  out <- collect(step)
+  expect_equal(out, tbl)
+})
+
+test_that("expand respects .name_repair", {
+  dt <- lazy_dt(tibble(x = 1:2), "DT")
+
+  suppressMessages(
+    expect_named(dt %>% expand(x, x, .name_repair = "unique") %>% collect(), c("x...1", "x...2"))
+  )
+})