Skip to content

Commit

Permalink
fcase / case_when function for data.table (#4021)
Browse files Browse the repository at this point in the history
  • Loading branch information
2005m authored and mattdowle committed Dec 19, 2019
1 parent f6519b7 commit 5742d79
Show file tree
Hide file tree
Showing 8 changed files with 411 additions and 0 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ export(setNumericRounding, getNumericRounding)
export(chmatch, "%chin%", chorder, chgroup)
export(rbindlist)
export(fifelse)
export(fcase)
export(fread)
export(fwrite)
export(foverlaps)
Expand Down
48 changes: 48 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,54 @@

5. `nafill` and `setnafill` gain `nan` argument to say whether `NaN` should be considered the same as `NA` for filling purposes, [#4020](https://github.com/Rdatatable/data.table/issues/4020). Prior versions had an implicit value of `nan=NaN`; the default is now `nan=NA`, i.e., `NaN` is treated as if it's missing. Thanks @AnonymousBoba for the suggestion. Also, while `nafill` still respects `getOption('datatable.verbose')`, the `verbose` argument has been removed.

6. New function `fcase(...,default)` implemented in C by Morgan Jacob, [#3823](https://github.com/Rdatatable/data.table/issues/3823), is inspired by SQL `CASE WHEN` which is a common tool in SQL for e.g. building labels or cutting age groups based on conditions. `fcase` is comparable to R function `dplyr::case_when` however it evaluates its arguments in a lazy way (i.e. only when needed) as shown below. Please see `?fcase` for more details.

```R
# Lazy evaluation
x = 1:10
data.table::fcase(
x < 5L, 1L,
x >= 5L, 3L,
x == 5L, stop("provided value is an unexpected one!")
)
# [1] 1 1 1 1 3 3 3 3 3 3

dplyr::case_when(
x < 5L ~ 1L,
x >= 5L ~ 3L,
x == 5L ~ stop("provided value is an unexpected one!")
)
# Error in eval_tidy(pair$rhs, env = default_env) :
# provided value is an unexpected one!

# Benchmark
x = sample(1:100, 3e7, replace = TRUE) # 114 MB
microbenchmark::microbenchmark(
dplyr::case_when(
x < 10L ~ 0L,
x < 20L ~ 10L,
x < 30L ~ 20L,
x < 40L ~ 30L,
x < 50L ~ 40L,
x < 60L ~ 50L,
x > 60L ~ 60L
),
data.table::fcase(
x < 10L, 0L,
x < 20L, 10L,
x < 30L, 20L,
x < 40L, 30L,
x < 50L, 40L,
x < 60L, 50L,
x > 60L, 60L
),
times = 5L,
unit = "s")
# Unit: seconds
# expr min lq mean median uq max neval
# dplyr::case_when 11.57 11.71 12.22 11.82 12.00 14.02 5
# data.table::fcase 1.49 1.55 1.67 1.71 1.73 1.86 5
```

## BUG FIXES

Expand Down
1 change: 1 addition & 0 deletions R/wrappers.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ fcoalesce = function(...) .Call(Ccoalesce, list(...), FALSE)
setcoalesce = function(...) .Call(Ccoalesce, list(...), TRUE)

fifelse = function(test, yes, no, na=NA) .Call(CfifelseR, test, yes, no, na)
fcase = function(..., default=NA) .Call(CfcaseR, default, parent.frame(), as.list(substitute(list(...)))[-1L])

colnamesInt = function(x, cols, check_dups=FALSE) .Call(CcolnamesInt, x, cols, check_dups)
coerceFill = function(x) .Call(CcoerceFillR, x)
Expand Down
102 changes: 102 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -16562,6 +16562,108 @@ DT = data.table(A="a", key="A")
test(2126.1, DT[J(NULL)], DT[0])
test(2126.2, DT[data.table()], DT[0])

# fcase, #3823
test_vec1 = -5L:5L < 0L
test_vec2 = -5L:5L > 0L
test_vec3 = -5L:5L < 5L
test_vec_na1 = c(test_vec1, NA)
test_vec_na2 = c(test_vec2, NA)
out_vec = c(1,1,1,1,1,NA,0,0,0,0,0)
out_vec_def = c(1,1,1,1,1,2,0,0,0,0,0)
out_vec_na= c(1,1,1,1,1,NA,0,0,0,0,0,NA)
out_vec_oc= c(1,1,1,1,1,NA,NA,NA,NA,NA,NA)
test(2127.01, fcase(test_vec1, 1L, test_vec2, 0L), as.integer(out_vec))
test(2127.02, fcase(test_vec1, 1, test_vec2, 0), out_vec)
test(2127.03, fcase(test_vec1, "1", test_vec2, "0"), as.character(out_vec))
test(2127.04, fcase(test_vec1, TRUE, test_vec2, FALSE), as.logical(out_vec))
test(2127.05, fcase(test_vec1, 1+0i, test_vec2, 0+0i), as.complex(out_vec))
test(2127.06, fcase(test_vec1, list(1), test_vec2, list(0)), list(1,1,1,1,1, NULL, 0, 0, 0, 0, 0))
test(2127.07, fcase(test_vec1, as.Date("2019-10-11"), test_vec2, as.Date("2019-10-14")), c(rep(as.Date("2019-10-11"),5),NA,rep(as.Date("2019-10-14"),5)))
test(2127.08, fcase(test_vec1, factor("a", levels=letters[1:3]), test_vec2, factor("b", levels=letters[1:3])), factor(c(rep("a",5),NA,rep("b",5)), levels=letters[1:3]))
test(2127.09, fcase(test_vec1, 1L, test_vec2, 0L, default=2L), as.integer(out_vec_def))
test(2127.10, fcase(test_vec1, 1, test_vec2, 0,default=2), out_vec_def)
test(2127.11, fcase(test_vec1, "1", test_vec2, "0", default ="2"), as.character(out_vec_def))
test(2127.12, fcase(test_vec1, TRUE, test_vec2, FALSE, default=TRUE), as.logical(out_vec_def))
test(2127.13, fcase(test_vec1, 1+0i, test_vec2, 0+0i, default=2+0i), as.complex(out_vec_def))
test(2127.14, fcase(test_vec1, list(1), test_vec2, list(0),default=list(2)), list(1,1,1,1,1, 2, 0, 0, 0, 0, 0))
test(2127.15, fcase(test_vec1, as.Date("2019-10-11"), test_vec2, as.Date("2019-10-14"),default=as.Date("2019-10-15")), c(rep(as.Date("2019-10-11"),5),as.Date("2019-10-15"),rep(as.Date("2019-10-14"),5)))
test(2127.16, fcase(test_vec1, factor("a", levels=letters[1:3]), test_vec2, factor("b", levels=letters[1:3]),default=factor("c", levels=letters[1:3])), factor(c(rep("a",5),"c",rep("b",5)), levels=letters[1:3]))
test(2127.17, fcase(test_vec1, as.raw(1), test_vec2, as.raw(0)), error="Type raw is not supported.")
test(2127.18, fcase(test_vec1, factor("a", levels=letters[1]), test_vec2, factor("b", levels=letters[1:3])), error="Argument #2 and argument #4 are both factor but their levels are different.")
test(2127.19, fcase(test_vec1, factor("a", levels=letters[1:2]), test_vec2, factor("b", levels=letters[1:2]),default=factor("c", levels=letters[1:3])), error="Resulting value and 'default' are both type factor but their levels are different.")
test(2127.20, fcase(test_vec1, 1L:10L, test_vec2, 3L:12L, test_vec2), error="Please supply an even number of arguments in ..., consisting of logical condition, resulting value pairs (in that order); received 5 inputs.")
test(2127.21, fcase(test_vec1, 1L, test_vec2, 3), error="Argument #4 is of type double, however argument #2 is of type integer. Please make sure all output values have the same type.")
test(2127.22, fcase(test_vec1, "FALSE", test_vec2, TRUE), error="Argument #4 is of type logical, however argument #2 is of type character. Please make sure all output values have the same type.")
test(2127.23, fcase(test_vec1, "FALSE", test_vec2, 5L), error="Argument #4 is of type integer, however argument #2 is of type character. Please make sure all output values have the same type.")
test(2127.24, fcase(test_vec1, as.Date("2019-10-11"), test_vec2, as.Date("2019-10-14"),default="2019-10-15"), error="Resulting value is of type double but 'default' is of type character. Please make sure that both arguments have the same type.")
test(2127.25, fcase(test_vec1, as.Date("2019-10-11"), test_vec2, as.Date("2019-10-14"),default=123), error="Resulting value has different class than 'default'. Please make sure that both arguments have the same class.")
if(test_bit64) {
i=as.integer64(1:12)+3e9
test(2127.26, fcase(test_vec_na1, i, test_vec_na2, i+100), c(i[1L:5L], as.integer64(NA),i[7L:12L]+100))
}
if(test_nanotime) {
n=nanotime(1:12)
test(2127.27, fcase(test_vec_na1, n, test_vec_na2, n+100), c(n[1L:5L], nanotime(NA),n[7L:12L]+100))
}
test(2127.28, fcase(test_vec1, rep(1L,11L), test_vec2, rep(0L,11L)), as.integer(out_vec))
test(2127.29, fcase(test_vec1, rep(1,11L), test_vec2, rep(0,11L)), out_vec)
test(2127.30, fcase(test_vec1, rep("1",11L), test_vec2, rep("0",11L)), as.character(out_vec))
test(2127.31, fcase(test_vec1, rep(TRUE,11L), test_vec2, rep(FALSE,11L)), as.logical(out_vec))
test(2127.32, fcase(test_vec1, rep(1+0i,11L), test_vec2, rep(0+0i,11L)), as.complex(out_vec))
test(2127.33, fcase(test_vec1, rep(list(1),11L), test_vec2, rep(list(0),11L)), list(1,1,1,1,1, NULL, 0, 0, 0, 0, 0))
test(2127.34, fcase(test_vec1, rep(as.Date("2019-10-11"),11L), test_vec2, rep(as.Date("2019-10-14"),11L)), c(rep(as.Date("2019-10-11"),5),NA,rep(as.Date("2019-10-14"),5)))
test(2127.35, fcase(test_vec1, rep(factor("a", levels=letters[1:3]),11L), test_vec2, rep(factor("b", levels=letters[1:3]),11L)), factor(c(rep("a",5),NA,rep("b",5)), levels=letters[1:3]))
test(2127.36, fcase(test_vec_na1, 1L, test_vec_na2, 0L), as.integer(out_vec_na))
test(2127.37, fcase(test_vec_na1, 1, test_vec_na2, 0), out_vec_na)
test(2127.38, fcase(test_vec_na1, "1", test_vec_na2, "0"), as.character(out_vec_na))
test(2127.39, fcase(test_vec_na1, TRUE, test_vec_na2, FALSE), as.logical(out_vec_na))
test(2127.40, fcase(test_vec_na1, 1+0i, test_vec_na2, 0+0i), as.complex(out_vec_na))
test(2127.41, fcase(test_vec_na1, list(1), test_vec_na2, list(0)), list(1,1,1,1,1, NULL, 0, 0, 0, 0, 0,NULL))
test(2127.42, fcase(c(TRUE,TRUE,TRUE,FALSE,FALSE),factor(NA,levels=letters[1:5]),c(FALSE,FALSE,FALSE,TRUE,TRUE),factor(letters[1:5])),factor(c(NA,NA,NA,"d","e"),levels=letters[1:5]))
test(2127.43, fcase(c(TRUE,TRUE,TRUE,FALSE,NA,FALSE),factor(NA,levels=letters[1:6]),c(FALSE,FALSE,FALSE,TRUE,NA,TRUE),factor(letters[1:6])),factor(c(NA,NA,NA,"d",NA,"f"),levels=letters[1:6]))
test(2127.44, fcase(c(TRUE,TRUE,TRUE,FALSE,NA,FALSE),factor(letters[1:6]),c(FALSE,FALSE,FALSE,TRUE,NA,TRUE),factor(NA,levels = letters[1:6])),factor(c("a","b","c",NA,NA,NA),levels=letters[1:6]))
test(2127.45, fcase(c(TRUE,NA,TRUE,FALSE,FALSE,FALSE),factor(NA),c(TRUE,TRUE,TRUE,FALSE,NA,FALSE),factor(NA)),factor(c(NA,NA,NA,NA,NA,NA)))
test(2127.46, fcase(TRUE, list(data.table(1:5)), FALSE, list(data.table(5:1))), list(data.table(1:5)))
test(2127.47, fcase(FALSE, list(data.table(1:5)), TRUE, list(data.table(5:1))), list(data.table(5:1)))
test(2127.48, fcase(TRUE, list(data.frame(1:5)), FALSE, list(data.frame(5:1))), list(data.frame(1:5)))
test(2127.49, fcase(FALSE, list(data.frame(1:5)), TRUE, list(data.frame(5:1))), list(data.frame(5:1)))
test(2127.50, fcase(1L,1L,TRUE,0L), error = "Argument #1 must be logical.")
test(2127.51, fcase(TRUE,1L,5L,0L), 1L)
test(2127.52, fcase(test_vec1, 1L, test_vec2, 0L, test_vec3, 2L), as.integer(out_vec_def))
test(2127.53, fcase(test_vec1, 1, test_vec2, 0, test_vec3, 2), out_vec_def)
test(2127.54, fcase(test_vec1, "1", test_vec2, "0", test_vec3, "2"), as.character(out_vec_def))
test(2127.55, fcase(test_vec1, TRUE, test_vec2, FALSE, test_vec3, TRUE), as.logical(out_vec_def))
test(2127.56, fcase(test_vec1, 1+0i, test_vec2, 0+0i, test_vec3, 2+0i), as.complex(out_vec_def))
test(2127.57, fcase(test_vec1, list(1), test_vec2, list(0), test_vec3, list(2)), list(1,1,1,1,1, 2, 0, 0, 0, 0, 0))
test(2127.58, fcase(test_vec1, as.Date("2019-10-11"), test_vec2, as.Date("2019-10-14"), test_vec3, as.Date("2019-10-15")), c(rep(as.Date("2019-10-11"),5),as.Date("2019-10-15"),rep(as.Date("2019-10-14"),5)))
test(2127.59, fcase(test_vec1, factor("a", levels=letters[1:3]), test_vec2, factor("b", levels=letters[1:3]), test_vec3, factor("c", levels=letters[1:3])), factor(c(rep("a",5),"c",rep("b",5)), levels=letters[1:3]))
test(2127.60, fcase(test_vec1, 1L), as.integer(out_vec_oc))
test(2127.61, fcase(test_vec1, 1), out_vec_oc)
test(2127.62, fcase(test_vec1, "1"), as.character(out_vec_oc))
test(2127.63, fcase(test_vec1, TRUE), as.logical(out_vec_oc))
test(2127.64, fcase(test_vec1, 1+0i), as.complex(out_vec_oc))
test(2127.65, fcase(test_vec1, list(1)), list(1,1,1,1,1, NULL, NULL, NULL, NULL, NULL, NULL))
test(2127.66, fcase(test_vec1, as.Date("2019-10-11")), c(rep(as.Date("2019-10-11"),5),rep(NA,6)))
test(2127.67, fcase(test_vec1, factor("a", levels=letters[1:3])), factor(c(rep("a",5),rep("NA",6)), levels=letters[1:3]))
test(2127.68, fcase(test_vec1, 1L, default = 1:2), error = "Length of 'default' must be 1.")
test(2127.69, fcase(test_vec1, 1L, test_vec_na1, 2L), error = "Argument #3 has a different length than argument #1. Please make sure all logical conditions have the same length.")
test(2127.70, fcase(test_vec1, as.Date("2019-10-11"), test_vec2, 2), error = "Argument #4 has different class than argument #2, Please make sure all output values have the same class.")
test(2127.71, fcase(test_vec1, 1L, test_vec2, 2:3), error = "Length of output value #4 must either be 1 or length of logical condition.")
test(2127.72, fcase(TRUE, 1L, FALSE, stop("bang!")), 1L)
test(2127.73, fcase(test_vec1, 1L, test_vec2, 0:10), as.integer(c( 1, 1, 1, 1, 1, NA, 6, 7, 8, 9, 10)))
test(2127.74, fcase(test_vec1, 0:10, test_vec2, 0L), as.integer(c( 0, 1, 2, 3, 4, NA, 0, 0, 0, 0, 0)))
test(2127.75, fcase(test_vec1, 1, test_vec2, as.numeric(0:10)), as.numeric(c( 1, 1, 1, 1, 1, NA, 6, 7, 8, 9, 10)))
test(2127.76, fcase(test_vec1, as.numeric(0:10), test_vec2, 0), as.numeric(c( 0, 1, 2, 3, 4, NA, 0, 0, 0, 0, 0)))
test(2127.77, fcase(test_vec1, "1", test_vec2, as.character(0:10)), as.character(c( 1, 1, 1, 1, 1, NA, 6, 7, 8, 9, 10)))
test(2127.78, fcase(test_vec1, as.character(0:10), test_vec2, "0"), as.character(c( 0, 1, 2, 3, 4, NA, 0, 0, 0, 0, 0)))
test(2127.79, fcase(test_vec1, TRUE, test_vec2, rep(FALSE, 11L)), as.logical(out_vec))
test(2127.80, fcase(test_vec1, rep(TRUE, 11L), test_vec2, FALSE), as.logical(out_vec))
test(2127.81, fcase(test_vec1, 1+0i, test_vec2, rep(0+0i, 11L)), as.complex(out_vec))
test(2127.82, fcase(test_vec1, rep(1+0i, 11L), test_vec2, 0+0i), as.complex(out_vec))
test(2127.83, fcase(test_vec1, list(rep(1, 11L)), test_vec2, list(0)), list(rep(1, 11L),rep(1, 11L),rep(1, 11L),rep(1, 11L),rep(1, 11L), NULL, 0, 0, 0, 0, 0))
test(2127.84, fcase(test_vec1, list(1), test_vec2, list(rep(0,11L))), list(1,1,1,1,1, NULL, rep(0,11L), rep(0,11L), rep(0,11L), rep(0,11L), rep(0,11L)))
test(2127.85, fcase(test_vec1, list(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), test_vec2, list(0)), list(1,1,1,1,1, NULL, 0, 0, 0, 0, 0))
test(2127.86, fcase(test_vec1, list(1), test_vec2, list(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)), list(1,1,1,1,1, NULL, 0, 0, 0, 0, 0))

###################################
# Add new tests above this line #
Expand Down
58 changes: 58 additions & 0 deletions man/fcase.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
\name{fcase}
\alias{fcase}
\title{fcase}
\description{
\code{fcase} is a fast implementation of SQL \code{CASE WHEN} statement for R. Conceptually, \code{fcase} is a nested version of \code{\link{fifelse}} (with smarter implementation than manual nesting). It is comparable to \code{dplyr::case_when} and supports \code{bit64}'s \code{integer64} and \code{nanotime} classes.
}
\usage{
fcase(..., default=NA)
}
\arguments{
\item{...}{ A sequence consisting of logical condition (\code{when})-resulting value (\code{value}) \emph{pairs} in the following order \code{when1, value1, when2, value2, ..., whenN, valueN}. Logical conditions \code{when1, when2, ..., whenN} must all have the same length, type and attributes. Each \code{value} may either share length with \code{when} or be length 1. Please see Examples section for further details.}
\item{default}{ Default return value, \code{NA} by default, for when all of the logical conditions \code{when1, when2, ..., whenN} are \code{FALSE} or missing for some entries. }
}
\value{
Vector with the same length as the logical conditions (\code{when}) in \code{...}, filled with the corresponding values (\code{value}) from \code{...}, or eventually \code{default}. Attributes of output values \code{value1, value2, ...valueN} in \code{...} are preserved.
}
\seealso{
\code{\link{fifelse}}
}
\examples{
x = 1:10
fcase(
x < 5L, 1L,
x > 5L, 3L
)
fcase(
x < 5L, 1L:10L,
x > 5L, 3L:12L
)
# Lazy evaluation example
fcase(
x < 5L, 1L,
x >= 5L, 3L,
x == 5L, stop("provided value is an unexpected one!")
)
# fcase preserves attributes, example with dates
fcase(
x < 5L, as.Date("2019-10-11"),
x > 5L, as.Date("2019-10-14")
)
# fcase example with factor; note the matching levels
fcase(
x < 5L, factor("a", levels=letters[1:3]),
x > 5L, factor("b", levels=letters[1:3])
)
# Example of using the 'default' argument
fcase(
x < 5L, 1L,
x > 5L, 3L,
default = 5L
)
}
\keyword{ data }
4 changes: 4 additions & 0 deletions src/data.table.h
Original file line number Diff line number Diff line change
Expand Up @@ -234,3 +234,7 @@ SEXP coerceUtf8IfNeeded(SEXP x);
char *end(char *start);
void ansMsg(ans_t *ans, int n, bool verbose, const char *func);
SEXP testMsgR(SEXP status, SEXP x, SEXP k);

//fifelse.c
SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na);
SEXP fcaseR(SEXP na, SEXP rho, SEXP args);
Loading

0 comments on commit 5742d79

Please sign in to comment.