From 3360f20738adde68195ca59f3c824bdeaed32a52 Mon Sep 17 00:00:00 2001 From: Mark Fairbanks Date: Fri, 19 Mar 2021 15:29:10 -0700 Subject: [PATCH] Implement expand and complete (#225) Also bump minimum data.table version to 1.13.0 --- DESCRIPTION | 2 +- NEWS.md | 8 ++- R/complete.R | 38 +++++++++++ R/step-join.R | 3 +- R/step-subset-expand.R | 78 ++++++++++++++++++++++ R/step-subset-slice.R | 4 +- R/step-subset.R | 8 ++- R/tidyeval.R | 2 +- R/zzz.R | 4 ++ man/complete.dtplyr_step.Rd | 52 +++++++++++++++ man/expand.dtplyr_step.Rd | 84 +++++++++++++++++++++++ tests/testthat/test-complete.R | 25 +++++++ tests/testthat/test-step-subset-expand.R | 85 ++++++++++++++++++++++++ 13 files changed, 383 insertions(+), 10 deletions(-) create mode 100644 R/complete.R create mode 100644 R/step-subset-expand.R create mode 100644 man/complete.dtplyr_step.Rd create mode 100644 man/expand.dtplyr_step.Rd create mode 100644 tests/testthat/test-complete.R create mode 100644 tests/testthat/test-step-subset-expand.R diff --git a/DESCRIPTION b/DESCRIPTION index 738aa6076..94439d37b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -19,7 +19,7 @@ Depends: R (>= 3.3) Imports: crayon, - data.table (>= 1.12.4), + data.table (>= 1.13.0), dplyr (>= 1.0.3), ellipsis, glue, diff --git a/NEWS.md b/NEWS.md index 3e66b9f14..e9fddd732 100644 --- a/NEWS.md +++ b/NEWS.md @@ -22,10 +22,14 @@ * `.data` and `.env` pronouns now work inside of `if_else()` (@markfairbanks, #220). -* More tidyr verbs are in the process of being added: - +* More translations for tidyr verbs have been added: + * `drop_na()` (@markfairbanks, #194) + * `complete()` (@markfairbanks, #225) + + * `expand()` (@markfairbanks, #225) + * `fill()` (@markfairbanks, #197) * `pivot_longer()` (@markfairbanks, #204) diff --git a/R/complete.R b/R/complete.R new file mode 100644 index 000000000..668b9b7d3 --- /dev/null +++ b/R/complete.R @@ -0,0 +1,38 @@ +#' Complete a data frame with missing combinations of data +#' +#' @description +#' This is a method for the tidyr `complete()` generic. This is a wrapper +#' around `dtplyr` translations for `expand()`, `full_join()`, and `replace_na()` +#' that's useful for completing missing combinations of data. +#' +#' @param data A [lazy_dt()]. +#' @inheritParams tidyr::complete +#' @examples +#' library(tidyr) +#' tbl <- tibble(x = 1:2, y = 1:2, z = 3:4) +#' dt <- lazy_dt(tbl) +#' +#' dt %>% +#' complete(x, y) +#' +#' dt %>% +#' complete(x, y, fill = list(z = 10L)) +# exported onLoad +complete.dtplyr_step <- function(data, ..., fill = list()) { + dots <- enquos(...) + dots <- dots[!vapply(dots, quo_is_null, logical(1))] + if (length(dots) == 0) { + return(data) + } + + full <- tidyr::expand(data, !!!dots) + full <- dplyr::full_join(full, data, by = full$vars) + full <- tidyr::replace_na(full, replace = fill) + full +} + +# exported onLoad +complete.data.table <- function(data, ..., fill = list()) { + data <- lazy_dt(data) + tidyr::complete(data, ..., fill = fill) +} diff --git a/R/step-join.R b/R/step-join.R index f8d0379da..9da8944ac 100644 --- a/R/step-join.R +++ b/R/step-join.R @@ -128,7 +128,8 @@ step_subset_on <- function(x, y, i, on) { vars = union(x$vars, y$vars), i = y, on = on, - locals = utils::modifyList(x$locals, y$locals) + locals = utils::modifyList(x$locals, y$locals), + allow_cartesian = TRUE ) } diff --git a/R/step-subset-expand.R b/R/step-subset-expand.R new file mode 100644 index 000000000..fefa753fa --- /dev/null +++ b/R/step-subset-expand.R @@ -0,0 +1,78 @@ +#' Expand data frame to include all possible combinations of values. +#' +#' @description +#' This is a method for the tidyr `expand()` generic. It is translated to +#' [data.table::CJ()]. +#' +#' @param data A [lazy_dt()]. +#' @inheritParams tidyr::expand +#' @examples +#' library(tidyr) +#' +#' fruits <- lazy_dt(tibble( +#' type = c("apple", "orange", "apple", "orange", "orange", "orange"), +#' year = c(2010, 2010, 2012, 2010, 2010, 2012), +#' size = factor( +#' c("XS", "S", "M", "S", "S", "M"), +#' levels = c("XS", "S", "M", "L") +#' ), +#' weights = rnorm(6, as.numeric(size) + 2) +#' )) +#' +#' # All possible combinations --------------------------------------- +#' # Note that all defined, but not necessarily present, levels of the +#' # factor variable `size` are retained. +#' fruits %>% expand(type) +#' fruits %>% expand(type, size) +#' fruits %>% expand(type, size, year) +#' +#' # Other uses ------------------------------------------------------- +#' fruits %>% expand(type, size, 2010:2012) +#' +#' # Use `anti_join()` to determine which observations are missing +#' all <- fruits %>% expand(type, size, year) +#' all +#' all %>% dplyr::anti_join(fruits) +#' +#' # Use with `right_join()` to fill in missing rows +#' fruits %>% dplyr::right_join(all) +# exported onLoad +expand.dtplyr_step <- function(data, ..., .name_repair = "check_unique") { + dots <- capture_dots(data, ..., .j = FALSE) + dots <- dots[!vapply(dots, is_null, logical(1))] + if (length(dots) == 0) { + return(data) + } + + named_dots <- have_name(dots) + if (any(!named_dots)) { + # Auto-names generated by enquos() don't always work with the CJ() step + ## Ex: `1:3` + # Replicates the "V" naming convention data.table uses + symbol_dots <- vapply(dots, is_symbol, logical(1)) + needs_v_name <- !symbol_dots & !named_dots + v_names <- paste0("V", 1:length(dots)) + names(dots)[needs_v_name] <- v_names[needs_v_name] + names(dots)[symbol_dots] <- lapply(dots[symbol_dots], as_name) + } + names(dots) <- vctrs::vec_as_names(names(dots), repair = .name_repair) + + on <- names(dots) + cj <- expr(CJ(!!!syms(on), unique = TRUE)) + + out <- distinct(data, !!!syms(data$groups), !!!dots) + if (length(data$groups) == 0) { + out <- step_subset(out, i = cj, on = on) + } else { + on <- call2(".", !!!syms(on)) + out <- step_subset(out, j = expr(.SD[!!cj, on = !!on])) + } + + out +} + +# exported onLoad +expand.data.table <- function(data, ..., .name_repair = "check_unique") { + data <- lazy_dt(data) + tidyr::expand(data, ..., .name_repair = .name_repair) +} diff --git a/R/step-subset-slice.R b/R/step-subset-slice.R index a5b8cae20..35614c8b1 100644 --- a/R/step-subset-slice.R +++ b/R/step-subset-slice.R @@ -157,8 +157,8 @@ slice_min_max <- function(.data, order_by, decreasing, ..., n, prop, with_ties = prop = expr(!!smaller_ranks(!!order_by, !!size$prop * .N, ties.method = ties.method)) ) - step_subset_i(.data, i) %>% - arrange(!!order_by, .by_group = TRUE) + out <- step_subset_i(.data, i) + arrange(out, !!order_by, .by_group = TRUE) } smaller_ranks <- function(x, y, ties.method = "min") { diff --git a/R/step-subset.R b/R/step-subset.R index 86bbefcb9..00ee8634d 100644 --- a/R/step-subset.R +++ b/R/step-subset.R @@ -5,8 +5,9 @@ step_subset <- function(parent, arrange = parent$arrange, i = NULL, j = NULL, - on = character() - ) { + on = character(), + allow_cartesian = NULL +) { stopifnot(is_step(parent)) stopifnot(is.null(i) || is_expression(i) || is_step(i)) @@ -22,6 +23,7 @@ step_subset <- function(parent, i = i, j = j, on = on, + allow_cartesian = allow_cartesian, implicit_copy = !is.null(i) || !is.null(j), class = "dtplyr_step_subset" ) @@ -117,7 +119,7 @@ dt_call.dtplyr_step_subset <- function(x, needs_copy = x$needs_copy) { if (length(x$on) > 0) { out$on <- call2(".", !!!syms(x$on)) - out$allow.cartesian <- TRUE + out$allow.cartesian <- x$allow_cartesian } out } diff --git a/R/tidyeval.R b/R/tidyeval.R index b97f29545..704064762 100644 --- a/R/tidyeval.R +++ b/R/tidyeval.R @@ -14,7 +14,7 @@ dt_eval <- function(x) { # Make sure data.table functions are available so dtplyr still works # even when data.table isn't attached dt_funs <- c( - "copy", "dcast", "melt", "nafill", + "CJ", "copy", "dcast", "melt", "nafill", "fcase", "fcoalesce", "fifelse", "fintersect", "frank", "frankv", "fsetdiff", "funion", "setcolorder", "setnames" ) diff --git a/R/zzz.R b/R/zzz.R index 0537891b0..82c810ca8 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -4,7 +4,9 @@ register_s3_method("dplyr", "intersect", "data.table") register_s3_method("dplyr", "setdiff", "data.table") register_s3_method("dplyr", "union", "data.table") + register_s3_method("tidyr", "complete", "data.table") register_s3_method("tidyr", "drop_na", "data.table") + register_s3_method("tidyr", "expand", "data.table") register_s3_method("tidyr", "fill", "data.table") register_s3_method("tidyr", "pivot_longer", "data.table") register_s3_method("tidyr", "pivot_wider", "data.table") @@ -14,7 +16,9 @@ register_s3_method("dplyr", "intersect", "dtplyr_step") register_s3_method("dplyr", "setdiff", "dtplyr_step") register_s3_method("dplyr", "union", "dtplyr_step") + register_s3_method("tidyr", "complete", "dtplyr_step") register_s3_method("tidyr", "drop_na", "dtplyr_step") + register_s3_method("tidyr", "expand", "dtplyr_step") register_s3_method("tidyr", "fill", "dtplyr_step") register_s3_method("tidyr", "pivot_longer", "dtplyr_step") register_s3_method("tidyr", "pivot_wider", "dtplyr_step") diff --git a/man/complete.dtplyr_step.Rd b/man/complete.dtplyr_step.Rd new file mode 100644 index 000000000..d53b93f6a --- /dev/null +++ b/man/complete.dtplyr_step.Rd @@ -0,0 +1,52 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/complete.R +\name{complete.dtplyr_step} +\alias{complete.dtplyr_step} +\title{Complete a data frame with missing combinations of data} +\usage{ +\method{complete}{dtplyr_step}(data, ..., fill = list()) +} +\arguments{ +\item{data}{A \code{\link[=lazy_dt]{lazy_dt()}}.} + +\item{...}{Specification of columns to expand. Columns can be atomic vectors +or lists. +\itemize{ +\item To find all unique combinations of \code{x}, \code{y} and \code{z}, including those not +present in the data, supply each variable as a separate argument: +\code{expand(df, x, y, z)}. +\item To find only the combinations that occur in the +data, use \code{nesting}: \code{expand(df, nesting(x, y, z))}. +\item You can combine the two forms. For example, +\code{expand(df, nesting(school_id, student_id), date)} would produce +a row for each present school-student combination for all possible +dates. +} + +When used with factors, \code{expand()} uses the full set of levels, not just +those that appear in the data. If you want to use only the values seen in +the data, use \code{forcats::fct_drop()}. + +When used with continuous variables, you may need to fill in values +that do not appear in the data: to do so use expressions like +\code{year = 2010:2020} or \code{year = full_seq(year,1)}.} + +\item{fill}{A named list that for each variable supplies a single value to +use instead of \code{NA} for missing combinations.} +} +\description{ +This is a method for the tidyr \code{complete()} generic. This is a wrapper +around \code{dtplyr} translations for \code{expand()}, \code{full_join()}, and \code{replace_na()} +that's useful for completing missing combinations of data. +} +\examples{ +library(tidyr) +tbl <- tibble(x = 1:2, y = 1:2, z = 3:4) +dt <- lazy_dt(tbl) + +dt \%>\% + complete(x, y) + +dt \%>\% + complete(x, y, fill = list(z = 10L)) +} diff --git a/man/expand.dtplyr_step.Rd b/man/expand.dtplyr_step.Rd new file mode 100644 index 000000000..31f4698c7 --- /dev/null +++ b/man/expand.dtplyr_step.Rd @@ -0,0 +1,84 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/step-subset-expand.R +\name{expand.dtplyr_step} +\alias{expand.dtplyr_step} +\title{Expand data frame to include all possible combinations of values.} +\usage{ +\method{expand}{dtplyr_step}(data, ..., .name_repair = "check_unique") +} +\arguments{ +\item{data}{A \code{\link[=lazy_dt]{lazy_dt()}}.} + +\item{...}{Specification of columns to expand. Columns can be atomic vectors +or lists. +\itemize{ +\item To find all unique combinations of \code{x}, \code{y} and \code{z}, including those not +present in the data, supply each variable as a separate argument: +\code{expand(df, x, y, z)}. +\item To find only the combinations that occur in the +data, use \code{nesting}: \code{expand(df, nesting(x, y, z))}. +\item You can combine the two forms. For example, +\code{expand(df, nesting(school_id, student_id), date)} would produce +a row for each present school-student combination for all possible +dates. +} + +When used with factors, \code{expand()} uses the full set of levels, not just +those that appear in the data. If you want to use only the values seen in +the data, use \code{forcats::fct_drop()}. + +When used with continuous variables, you may need to fill in values +that do not appear in the data: to do so use expressions like +\code{year = 2010:2020} or \code{year = full_seq(year,1)}.} + +\item{.name_repair}{Treatment of problematic column names: +\itemize{ +\item \code{"minimal"}: No name repair or checks, beyond basic existence, +\item \code{"unique"}: Make sure names are unique and not empty, +\item \code{"check_unique"}: (default value), no name repair, but check they are +\code{unique}, +\item \code{"universal"}: Make the names \code{unique} and syntactic +\item a function: apply custom name repair (e.g., \code{.name_repair = make.names} +for names in the style of base R). +\item A purrr-style anonymous function, see \code{\link[rlang:as_function]{rlang::as_function()}} +} + +This argument is passed on as \code{repair} to \code{\link[vctrs:vec_as_names]{vctrs::vec_as_names()}}. +See there for more details on these terms and the strategies used +to enforce them.} +} +\description{ +This is a method for the tidyr \code{expand()} generic. It is translated to +\code{\link[data.table:J]{data.table::CJ()}}. +} +\examples{ +library(tidyr) + +fruits <- lazy_dt(tibble( + type = c("apple", "orange", "apple", "orange", "orange", "orange"), + year = c(2010, 2010, 2012, 2010, 2010, 2012), + size = factor( + c("XS", "S", "M", "S", "S", "M"), + levels = c("XS", "S", "M", "L") + ), + weights = rnorm(6, as.numeric(size) + 2) +)) + +# All possible combinations --------------------------------------- +# Note that all defined, but not necessarily present, levels of the +# factor variable `size` are retained. +fruits \%>\% expand(type) +fruits \%>\% expand(type, size) +fruits \%>\% expand(type, size, year) + +# Other uses ------------------------------------------------------- +fruits \%>\% expand(type, size, 2010:2012) + +# Use `anti_join()` to determine which observations are missing +all <- fruits \%>\% expand(type, size, year) +all +all \%>\% dplyr::anti_join(fruits) + +# Use with `right_join()` to fill in missing rows +fruits \%>\% dplyr::right_join(all) +} diff --git a/tests/testthat/test-complete.R b/tests/testthat/test-complete.R new file mode 100644 index 000000000..1e4718e80 --- /dev/null +++ b/tests/testthat/test-complete.R @@ -0,0 +1,25 @@ +test_that("complete with no variables returns data as is", { + mtcars_dt <- lazy_dt(mtcars, "DT") + expect_equal(complete(mtcars_dt), mtcars_dt) +}) + +test_that("basic invocation works", { + tbl <- tibble(x = 1:2, y = 1:2, z = 3:4) + dt <- lazy_dt(tbl, "DT") + out <- dt %>% complete(x, y) %>% collect() + + expect_equal(nrow(out), 4) + expect_equal(out$z, c(3, NA, NA, 4)) +}) + +test_that("empty expansion returns original", { + tbl <- tibble(x = character()) + dt <- lazy_dt(tbl, "DT") + out <- dt %>% complete(y = NULL) %>% collect() + expect_equal(out, tbl) + + tbl <- tibble(x = 1:4) + dt <- lazy_dt(tbl, "DT") + out <- dt %>% complete(y = NULL) %>% collect() + expect_equal(out, tbl) +}) diff --git a/tests/testthat/test-step-subset-expand.R b/tests/testthat/test-step-subset-expand.R new file mode 100644 index 000000000..69086da5c --- /dev/null +++ b/tests/testthat/test-step-subset-expand.R @@ -0,0 +1,85 @@ +test_that("expand completes all values", { + tbl <- tibble(x = 1:2, y = 1:2) + dt <- lazy_dt(tbl, "DT") + step <- expand(dt, x, y) + out <- collect(step) + + expect_equal( + show_query(step), + expr(unique(DT[, .(x, y)])[CJ(x, y, unique = TRUE), on = .(x, y)]) + ) + expect_equal(step$vars, c("x", "y")) + expect_equal(nrow(out), 4) +}) + +test_that("multiple variables in one arg doesn't expand", { + tbl <- tibble(x = 1:2, y = 1:2) + dt <- lazy_dt(tbl, "DT") + step <- expand(dt, c(x, y)) + out <- collect(step) + + expect_equal(nrow(out), 2) +}) + +test_that("works with unnamed vectors", { + tbl <- tibble(x = 1:2, y = 1:2) + dt <- lazy_dt(tbl, "DT") + step <- expand(dt, x, 1:2) + out <- collect(step) + + expect_equal( + show_query(step), + expr(unique(DT[, .(x = x, V2 = 1:2)])[CJ(x, V2, unique = TRUE), on = .(x, V2)]) + ) + expect_equal(step$vars, c("x", "V2")) + expect_equal(nrow(out), 4) +}) + +test_that("works with named vectors", { + tbl <- tibble(x = 1:2, y = 1:2) + dt <- lazy_dt(tbl, "DT") + step <- expand(dt, x, val = 1:2) + out <- collect(step) + + expect_equal( + show_query(step), + expr(unique(DT[, .(x = x, val = 1:2)])[CJ(x, val, unique = TRUE), on = .(x, val)]) + ) + expect_equal(step$vars, c("x", "val")) + expect_equal(nrow(out), 4) +}) + +test_that("expand respects groups", { + tbl <- tibble( + a = c(1L, 1L, 2L), + b = c(1L, 2L, 1L), + c = c(2L, 1L, 1L) + ) + dt <- lazy_dt(tbl, "DT") + step <- dt %>% group_by(c) %>% expand(a, b) + out <- collect(step) + + expect_equal( + show_query(step), + expr(unique(DT[, .(c, a, b)])[, .SD[CJ(a, b, unique = TRUE), on = .(a, b)], keyby = .(c)]) + ) + expect_equal(step$vars, c("c", "a", "b")) + expect_equal(out$a, c(1, 1, 2, 2, 1)) + expect_equal(out$b, c(1, 2, 1, 2, 1)) +}) + +test_that("NULL inputs", { + tbl <- tibble(x = 1:5) + dt <- lazy_dt(tbl, "DT") + step <- expand(dt, x, y = NULL) + out <- collect(step) + expect_equal(out, tbl) +}) + +test_that("expand respects .name_repair", { + dt <- lazy_dt(tibble(x = 1:2), "DT") + + suppressMessages( + expect_named(dt %>% expand(x, x, .name_repair = "unique") %>% collect(), c("x...1", "x...2")) + ) +})