From 5742d790652fc040b30e109cdb4bbb894c622a76 Mon Sep 17 00:00:00 2001 From: 2005m Date: Thu, 19 Dec 2019 03:15:07 +0000 Subject: [PATCH] fcase / case_when function for data.table (#4021) --- NAMESPACE | 1 + NEWS.md | 48 +++++++++++ R/wrappers.R | 1 + inst/tests/tests.Rraw | 102 ++++++++++++++++++++++ man/fcase.Rd | 58 +++++++++++++ src/data.table.h | 4 + src/fifelse.c | 195 ++++++++++++++++++++++++++++++++++++++++++ src/init.c | 2 + 8 files changed, 411 insertions(+) create mode 100644 man/fcase.Rd diff --git a/NAMESPACE b/NAMESPACE index 7689afe38..2112878f3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -17,6 +17,7 @@ export(setNumericRounding, getNumericRounding) export(chmatch, "%chin%", chorder, chgroup) export(rbindlist) export(fifelse) +export(fcase) export(fread) export(fwrite) export(foverlaps) diff --git a/NEWS.md b/NEWS.md index 8f17827eb..f02e5664e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -16,6 +16,54 @@ 5. `nafill` and `setnafill` gain `nan` argument to say whether `NaN` should be considered the same as `NA` for filling purposes, [#4020](https://github.com/Rdatatable/data.table/issues/4020). Prior versions had an implicit value of `nan=NaN`; the default is now `nan=NA`, i.e., `NaN` is treated as if it's missing. Thanks @AnonymousBoba for the suggestion. Also, while `nafill` still respects `getOption('datatable.verbose')`, the `verbose` argument has been removed. +6. New function `fcase(...,default)` implemented in C by Morgan Jacob, [#3823](https://github.com/Rdatatable/data.table/issues/3823), is inspired by SQL `CASE WHEN` which is a common tool in SQL for e.g. building labels or cutting age groups based on conditions. `fcase` is comparable to R function `dplyr::case_when` however it evaluates its arguments in a lazy way (i.e. only when needed) as shown below. Please see `?fcase` for more details. + +```R +# Lazy evaluation +x = 1:10 +data.table::fcase( + x < 5L, 1L, + x >= 5L, 3L, + x == 5L, stop("provided value is an unexpected one!") +) +# [1] 1 1 1 1 3 3 3 3 3 3 + +dplyr::case_when( + x < 5L ~ 1L, + x >= 5L ~ 3L, + x == 5L ~ stop("provided value is an unexpected one!") +) +# Error in eval_tidy(pair$rhs, env = default_env) : +# provided value is an unexpected one! + +# Benchmark +x = sample(1:100, 3e7, replace = TRUE) # 114 MB +microbenchmark::microbenchmark( +dplyr::case_when( + x < 10L ~ 0L, + x < 20L ~ 10L, + x < 30L ~ 20L, + x < 40L ~ 30L, + x < 50L ~ 40L, + x < 60L ~ 50L, + x > 60L ~ 60L +), +data.table::fcase( + x < 10L, 0L, + x < 20L, 10L, + x < 30L, 20L, + x < 40L, 30L, + x < 50L, 40L, + x < 60L, 50L, + x > 60L, 60L +), +times = 5L, +unit = "s") +# Unit: seconds +# expr min lq mean median uq max neval +# dplyr::case_when 11.57 11.71 12.22 11.82 12.00 14.02 5 +# data.table::fcase 1.49 1.55 1.67 1.71 1.73 1.86 5 +``` ## BUG FIXES diff --git a/R/wrappers.R b/R/wrappers.R index d1ca37970..5fec33a92 100644 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -6,6 +6,7 @@ fcoalesce = function(...) .Call(Ccoalesce, list(...), FALSE) setcoalesce = function(...) .Call(Ccoalesce, list(...), TRUE) fifelse = function(test, yes, no, na=NA) .Call(CfifelseR, test, yes, no, na) +fcase = function(..., default=NA) .Call(CfcaseR, default, parent.frame(), as.list(substitute(list(...)))[-1L]) colnamesInt = function(x, cols, check_dups=FALSE) .Call(CcolnamesInt, x, cols, check_dups) coerceFill = function(x) .Call(CcoerceFillR, x) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b1a6feaa6..e851393a1 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16562,6 +16562,108 @@ DT = data.table(A="a", key="A") test(2126.1, DT[J(NULL)], DT[0]) test(2126.2, DT[data.table()], DT[0]) +# fcase, #3823 +test_vec1 = -5L:5L < 0L +test_vec2 = -5L:5L > 0L +test_vec3 = -5L:5L < 5L +test_vec_na1 = c(test_vec1, NA) +test_vec_na2 = c(test_vec2, NA) +out_vec = c(1,1,1,1,1,NA,0,0,0,0,0) +out_vec_def = c(1,1,1,1,1,2,0,0,0,0,0) +out_vec_na= c(1,1,1,1,1,NA,0,0,0,0,0,NA) +out_vec_oc= c(1,1,1,1,1,NA,NA,NA,NA,NA,NA) +test(2127.01, fcase(test_vec1, 1L, test_vec2, 0L), as.integer(out_vec)) +test(2127.02, fcase(test_vec1, 1, test_vec2, 0), out_vec) +test(2127.03, fcase(test_vec1, "1", test_vec2, "0"), as.character(out_vec)) +test(2127.04, fcase(test_vec1, TRUE, test_vec2, FALSE), as.logical(out_vec)) +test(2127.05, fcase(test_vec1, 1+0i, test_vec2, 0+0i), as.complex(out_vec)) +test(2127.06, fcase(test_vec1, list(1), test_vec2, list(0)), list(1,1,1,1,1, NULL, 0, 0, 0, 0, 0)) +test(2127.07, fcase(test_vec1, as.Date("2019-10-11"), test_vec2, as.Date("2019-10-14")), c(rep(as.Date("2019-10-11"),5),NA,rep(as.Date("2019-10-14"),5))) +test(2127.08, fcase(test_vec1, factor("a", levels=letters[1:3]), test_vec2, factor("b", levels=letters[1:3])), factor(c(rep("a",5),NA,rep("b",5)), levels=letters[1:3])) +test(2127.09, fcase(test_vec1, 1L, test_vec2, 0L, default=2L), as.integer(out_vec_def)) +test(2127.10, fcase(test_vec1, 1, test_vec2, 0,default=2), out_vec_def) +test(2127.11, fcase(test_vec1, "1", test_vec2, "0", default ="2"), as.character(out_vec_def)) +test(2127.12, fcase(test_vec1, TRUE, test_vec2, FALSE, default=TRUE), as.logical(out_vec_def)) +test(2127.13, fcase(test_vec1, 1+0i, test_vec2, 0+0i, default=2+0i), as.complex(out_vec_def)) +test(2127.14, fcase(test_vec1, list(1), test_vec2, list(0),default=list(2)), list(1,1,1,1,1, 2, 0, 0, 0, 0, 0)) +test(2127.15, fcase(test_vec1, as.Date("2019-10-11"), test_vec2, as.Date("2019-10-14"),default=as.Date("2019-10-15")), c(rep(as.Date("2019-10-11"),5),as.Date("2019-10-15"),rep(as.Date("2019-10-14"),5))) +test(2127.16, fcase(test_vec1, factor("a", levels=letters[1:3]), test_vec2, factor("b", levels=letters[1:3]),default=factor("c", levels=letters[1:3])), factor(c(rep("a",5),"c",rep("b",5)), levels=letters[1:3])) +test(2127.17, fcase(test_vec1, as.raw(1), test_vec2, as.raw(0)), error="Type raw is not supported.") +test(2127.18, fcase(test_vec1, factor("a", levels=letters[1]), test_vec2, factor("b", levels=letters[1:3])), error="Argument #2 and argument #4 are both factor but their levels are different.") +test(2127.19, fcase(test_vec1, factor("a", levels=letters[1:2]), test_vec2, factor("b", levels=letters[1:2]),default=factor("c", levels=letters[1:3])), error="Resulting value and 'default' are both type factor but their levels are different.") +test(2127.20, fcase(test_vec1, 1L:10L, test_vec2, 3L:12L, test_vec2), error="Please supply an even number of arguments in ..., consisting of logical condition, resulting value pairs (in that order); received 5 inputs.") +test(2127.21, fcase(test_vec1, 1L, test_vec2, 3), error="Argument #4 is of type double, however argument #2 is of type integer. Please make sure all output values have the same type.") +test(2127.22, fcase(test_vec1, "FALSE", test_vec2, TRUE), error="Argument #4 is of type logical, however argument #2 is of type character. Please make sure all output values have the same type.") +test(2127.23, fcase(test_vec1, "FALSE", test_vec2, 5L), error="Argument #4 is of type integer, however argument #2 is of type character. Please make sure all output values have the same type.") +test(2127.24, fcase(test_vec1, as.Date("2019-10-11"), test_vec2, as.Date("2019-10-14"),default="2019-10-15"), error="Resulting value is of type double but 'default' is of type character. Please make sure that both arguments have the same type.") +test(2127.25, fcase(test_vec1, as.Date("2019-10-11"), test_vec2, as.Date("2019-10-14"),default=123), error="Resulting value has different class than 'default'. Please make sure that both arguments have the same class.") +if(test_bit64) { + i=as.integer64(1:12)+3e9 + test(2127.26, fcase(test_vec_na1, i, test_vec_na2, i+100), c(i[1L:5L], as.integer64(NA),i[7L:12L]+100)) +} +if(test_nanotime) { + n=nanotime(1:12) + test(2127.27, fcase(test_vec_na1, n, test_vec_na2, n+100), c(n[1L:5L], nanotime(NA),n[7L:12L]+100)) +} +test(2127.28, fcase(test_vec1, rep(1L,11L), test_vec2, rep(0L,11L)), as.integer(out_vec)) +test(2127.29, fcase(test_vec1, rep(1,11L), test_vec2, rep(0,11L)), out_vec) +test(2127.30, fcase(test_vec1, rep("1",11L), test_vec2, rep("0",11L)), as.character(out_vec)) +test(2127.31, fcase(test_vec1, rep(TRUE,11L), test_vec2, rep(FALSE,11L)), as.logical(out_vec)) +test(2127.32, fcase(test_vec1, rep(1+0i,11L), test_vec2, rep(0+0i,11L)), as.complex(out_vec)) +test(2127.33, fcase(test_vec1, rep(list(1),11L), test_vec2, rep(list(0),11L)), list(1,1,1,1,1, NULL, 0, 0, 0, 0, 0)) +test(2127.34, fcase(test_vec1, rep(as.Date("2019-10-11"),11L), test_vec2, rep(as.Date("2019-10-14"),11L)), c(rep(as.Date("2019-10-11"),5),NA,rep(as.Date("2019-10-14"),5))) +test(2127.35, fcase(test_vec1, rep(factor("a", levels=letters[1:3]),11L), test_vec2, rep(factor("b", levels=letters[1:3]),11L)), factor(c(rep("a",5),NA,rep("b",5)), levels=letters[1:3])) +test(2127.36, fcase(test_vec_na1, 1L, test_vec_na2, 0L), as.integer(out_vec_na)) +test(2127.37, fcase(test_vec_na1, 1, test_vec_na2, 0), out_vec_na) +test(2127.38, fcase(test_vec_na1, "1", test_vec_na2, "0"), as.character(out_vec_na)) +test(2127.39, fcase(test_vec_na1, TRUE, test_vec_na2, FALSE), as.logical(out_vec_na)) +test(2127.40, fcase(test_vec_na1, 1+0i, test_vec_na2, 0+0i), as.complex(out_vec_na)) +test(2127.41, fcase(test_vec_na1, list(1), test_vec_na2, list(0)), list(1,1,1,1,1, NULL, 0, 0, 0, 0, 0,NULL)) +test(2127.42, fcase(c(TRUE,TRUE,TRUE,FALSE,FALSE),factor(NA,levels=letters[1:5]),c(FALSE,FALSE,FALSE,TRUE,TRUE),factor(letters[1:5])),factor(c(NA,NA,NA,"d","e"),levels=letters[1:5])) +test(2127.43, fcase(c(TRUE,TRUE,TRUE,FALSE,NA,FALSE),factor(NA,levels=letters[1:6]),c(FALSE,FALSE,FALSE,TRUE,NA,TRUE),factor(letters[1:6])),factor(c(NA,NA,NA,"d",NA,"f"),levels=letters[1:6])) +test(2127.44, fcase(c(TRUE,TRUE,TRUE,FALSE,NA,FALSE),factor(letters[1:6]),c(FALSE,FALSE,FALSE,TRUE,NA,TRUE),factor(NA,levels = letters[1:6])),factor(c("a","b","c",NA,NA,NA),levels=letters[1:6])) +test(2127.45, fcase(c(TRUE,NA,TRUE,FALSE,FALSE,FALSE),factor(NA),c(TRUE,TRUE,TRUE,FALSE,NA,FALSE),factor(NA)),factor(c(NA,NA,NA,NA,NA,NA))) +test(2127.46, fcase(TRUE, list(data.table(1:5)), FALSE, list(data.table(5:1))), list(data.table(1:5))) +test(2127.47, fcase(FALSE, list(data.table(1:5)), TRUE, list(data.table(5:1))), list(data.table(5:1))) +test(2127.48, fcase(TRUE, list(data.frame(1:5)), FALSE, list(data.frame(5:1))), list(data.frame(1:5))) +test(2127.49, fcase(FALSE, list(data.frame(1:5)), TRUE, list(data.frame(5:1))), list(data.frame(5:1))) +test(2127.50, fcase(1L,1L,TRUE,0L), error = "Argument #1 must be logical.") +test(2127.51, fcase(TRUE,1L,5L,0L), 1L) +test(2127.52, fcase(test_vec1, 1L, test_vec2, 0L, test_vec3, 2L), as.integer(out_vec_def)) +test(2127.53, fcase(test_vec1, 1, test_vec2, 0, test_vec3, 2), out_vec_def) +test(2127.54, fcase(test_vec1, "1", test_vec2, "0", test_vec3, "2"), as.character(out_vec_def)) +test(2127.55, fcase(test_vec1, TRUE, test_vec2, FALSE, test_vec3, TRUE), as.logical(out_vec_def)) +test(2127.56, fcase(test_vec1, 1+0i, test_vec2, 0+0i, test_vec3, 2+0i), as.complex(out_vec_def)) +test(2127.57, fcase(test_vec1, list(1), test_vec2, list(0), test_vec3, list(2)), list(1,1,1,1,1, 2, 0, 0, 0, 0, 0)) +test(2127.58, fcase(test_vec1, as.Date("2019-10-11"), test_vec2, as.Date("2019-10-14"), test_vec3, as.Date("2019-10-15")), c(rep(as.Date("2019-10-11"),5),as.Date("2019-10-15"),rep(as.Date("2019-10-14"),5))) +test(2127.59, fcase(test_vec1, factor("a", levels=letters[1:3]), test_vec2, factor("b", levels=letters[1:3]), test_vec3, factor("c", levels=letters[1:3])), factor(c(rep("a",5),"c",rep("b",5)), levels=letters[1:3])) +test(2127.60, fcase(test_vec1, 1L), as.integer(out_vec_oc)) +test(2127.61, fcase(test_vec1, 1), out_vec_oc) +test(2127.62, fcase(test_vec1, "1"), as.character(out_vec_oc)) +test(2127.63, fcase(test_vec1, TRUE), as.logical(out_vec_oc)) +test(2127.64, fcase(test_vec1, 1+0i), as.complex(out_vec_oc)) +test(2127.65, fcase(test_vec1, list(1)), list(1,1,1,1,1, NULL, NULL, NULL, NULL, NULL, NULL)) +test(2127.66, fcase(test_vec1, as.Date("2019-10-11")), c(rep(as.Date("2019-10-11"),5),rep(NA,6))) +test(2127.67, fcase(test_vec1, factor("a", levels=letters[1:3])), factor(c(rep("a",5),rep("NA",6)), levels=letters[1:3])) +test(2127.68, fcase(test_vec1, 1L, default = 1:2), error = "Length of 'default' must be 1.") +test(2127.69, fcase(test_vec1, 1L, test_vec_na1, 2L), error = "Argument #3 has a different length than argument #1. Please make sure all logical conditions have the same length.") +test(2127.70, fcase(test_vec1, as.Date("2019-10-11"), test_vec2, 2), error = "Argument #4 has different class than argument #2, Please make sure all output values have the same class.") +test(2127.71, fcase(test_vec1, 1L, test_vec2, 2:3), error = "Length of output value #4 must either be 1 or length of logical condition.") +test(2127.72, fcase(TRUE, 1L, FALSE, stop("bang!")), 1L) +test(2127.73, fcase(test_vec1, 1L, test_vec2, 0:10), as.integer(c( 1, 1, 1, 1, 1, NA, 6, 7, 8, 9, 10))) +test(2127.74, fcase(test_vec1, 0:10, test_vec2, 0L), as.integer(c( 0, 1, 2, 3, 4, NA, 0, 0, 0, 0, 0))) +test(2127.75, fcase(test_vec1, 1, test_vec2, as.numeric(0:10)), as.numeric(c( 1, 1, 1, 1, 1, NA, 6, 7, 8, 9, 10))) +test(2127.76, fcase(test_vec1, as.numeric(0:10), test_vec2, 0), as.numeric(c( 0, 1, 2, 3, 4, NA, 0, 0, 0, 0, 0))) +test(2127.77, fcase(test_vec1, "1", test_vec2, as.character(0:10)), as.character(c( 1, 1, 1, 1, 1, NA, 6, 7, 8, 9, 10))) +test(2127.78, fcase(test_vec1, as.character(0:10), test_vec2, "0"), as.character(c( 0, 1, 2, 3, 4, NA, 0, 0, 0, 0, 0))) +test(2127.79, fcase(test_vec1, TRUE, test_vec2, rep(FALSE, 11L)), as.logical(out_vec)) +test(2127.80, fcase(test_vec1, rep(TRUE, 11L), test_vec2, FALSE), as.logical(out_vec)) +test(2127.81, fcase(test_vec1, 1+0i, test_vec2, rep(0+0i, 11L)), as.complex(out_vec)) +test(2127.82, fcase(test_vec1, rep(1+0i, 11L), test_vec2, 0+0i), as.complex(out_vec)) +test(2127.83, fcase(test_vec1, list(rep(1, 11L)), test_vec2, list(0)), list(rep(1, 11L),rep(1, 11L),rep(1, 11L),rep(1, 11L),rep(1, 11L), NULL, 0, 0, 0, 0, 0)) +test(2127.84, fcase(test_vec1, list(1), test_vec2, list(rep(0,11L))), list(1,1,1,1,1, NULL, rep(0,11L), rep(0,11L), rep(0,11L), rep(0,11L), rep(0,11L))) +test(2127.85, fcase(test_vec1, list(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), test_vec2, list(0)), list(1,1,1,1,1, NULL, 0, 0, 0, 0, 0)) +test(2127.86, fcase(test_vec1, list(1), test_vec2, list(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)), list(1,1,1,1,1, NULL, 0, 0, 0, 0, 0)) ################################### # Add new tests above this line # diff --git a/man/fcase.Rd b/man/fcase.Rd new file mode 100644 index 000000000..82e582ca4 --- /dev/null +++ b/man/fcase.Rd @@ -0,0 +1,58 @@ +\name{fcase} +\alias{fcase} +\title{fcase} +\description{ +\code{fcase} is a fast implementation of SQL \code{CASE WHEN} statement for R. Conceptually, \code{fcase} is a nested version of \code{\link{fifelse}} (with smarter implementation than manual nesting). It is comparable to \code{dplyr::case_when} and supports \code{bit64}'s \code{integer64} and \code{nanotime} classes. +} +\usage{ + fcase(..., default=NA) +} +\arguments{ +\item{...}{ A sequence consisting of logical condition (\code{when})-resulting value (\code{value}) \emph{pairs} in the following order \code{when1, value1, when2, value2, ..., whenN, valueN}. Logical conditions \code{when1, when2, ..., whenN} must all have the same length, type and attributes. Each \code{value} may either share length with \code{when} or be length 1. Please see Examples section for further details.} +\item{default}{ Default return value, \code{NA} by default, for when all of the logical conditions \code{when1, when2, ..., whenN} are \code{FALSE} or missing for some entries. } +} +\value{ + Vector with the same length as the logical conditions (\code{when}) in \code{...}, filled with the corresponding values (\code{value}) from \code{...}, or eventually \code{default}. Attributes of output values \code{value1, value2, ...valueN} in \code{...} are preserved. +} +\seealso{ + \code{\link{fifelse}} +} +\examples{ +x = 1:10 +fcase( + x < 5L, 1L, + x > 5L, 3L +) + +fcase( + x < 5L, 1L:10L, + x > 5L, 3L:12L +) + +# Lazy evaluation example +fcase( + x < 5L, 1L, + x >= 5L, 3L, + x == 5L, stop("provided value is an unexpected one!") +) + +# fcase preserves attributes, example with dates +fcase( + x < 5L, as.Date("2019-10-11"), + x > 5L, as.Date("2019-10-14") +) + +# fcase example with factor; note the matching levels +fcase( + x < 5L, factor("a", levels=letters[1:3]), + x > 5L, factor("b", levels=letters[1:3]) +) + +# Example of using the 'default' argument +fcase( + x < 5L, 1L, + x > 5L, 3L, + default = 5L +) +} +\keyword{ data } diff --git a/src/data.table.h b/src/data.table.h index d972a2395..f0d5b2a16 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -234,3 +234,7 @@ SEXP coerceUtf8IfNeeded(SEXP x); char *end(char *start); void ansMsg(ans_t *ans, int n, bool verbose, const char *func); SEXP testMsgR(SEXP status, SEXP x, SEXP k); + +//fifelse.c +SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na); +SEXP fcaseR(SEXP na, SEXP rho, SEXP args); diff --git a/src/fifelse.c b/src/fifelse.c index de53c9bd6..69f7b46b1 100644 --- a/src/fifelse.c +++ b/src/fifelse.c @@ -140,3 +140,198 @@ SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na) { UNPROTECT(nprotect); return ans; } + +SEXP fcaseR(SEXP na, SEXP rho, SEXP args) { + int n=length(args); + if (n % 2) { + error("Please supply an even number of arguments in ..., consisting of logical condition," + " resulting value pairs (in that order); received %d inputs.", n); + } + int nprotect = 0, l = 0; + int64_t len0=0, len1=0, len2=0, idx=0; + SEXP ans = R_NilValue, value0 = R_NilValue, tracker = R_NilValue, cons = R_NilValue, outs = R_NilValue; + PROTECT_INDEX Icons, Iouts; + PROTECT_WITH_INDEX(cons, &Icons); nprotect++; + PROTECT_WITH_INDEX(outs, &Iouts); nprotect++; + SEXPTYPE type0; + bool nonna = !isNull(na), imask = true; + int *restrict p = NULL; + n = n/2; + for (int i=0; i1 ? INT64_MAX : 0; + switch(TYPEOF(outs)) { + case LGLSXP: { + const int *restrict pouts = LOGICAL(outs); + int *restrict pans = LOGICAL(ans); + const int pna = nonna ? LOGICAL(na)[0] : NA_LOGICAL; + for (int64_t j=0; j