Skip to content

Commit f523905

Browse files
authored
Merge branch 'main' into f-167-lubridate-month
2 parents c346fb5 + 23f1dd3 commit f523905

File tree

7 files changed

+217
-30
lines changed

7 files changed

+217
-30
lines changed

DESCRIPTION

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Type: Package
22
Package: duckplyr
33
Title: A 'DuckDB'-Backed Version of 'dplyr'
4-
Version: 1.0.1.9005
4+
Version: 1.0.1.9006
55
Authors@R: c(
66
person("Hannes", "Mühleisen", role = "aut",
77
comment = c(ORCID = "0000-0001-8552-0029")),
@@ -63,7 +63,7 @@ Config/Needs/development:
6363
qs,
6464
reprex,
6565
r-lib/roxygen2,
66-
anthonynorth/roxyglobals,
66+
roxyglobals,
6767
rstudioapi,
6868
tidyverse
6969
Config/Needs/website:

NEWS.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,22 @@
11
<!-- NEWS.md is maintained by https://fledge.cynkra.com, contributors should not edit this file -->
22

3+
# duckplyr 1.0.1.9006 (2025-03-27)
4+
5+
## Features
6+
7+
- Implement `n_distinct()` as macro with support for `na.rm = TRUE` (@joakimlinde, #572, #655).
8+
9+
## Chore
10+
11+
- Use roxyglobals from CRAN rather than GitHub (@andreranza, #659).
12+
13+
- Space at EOL.
14+
15+
## Documentation
16+
17+
- Recommend `pak::pak()`.
18+
19+
320
# duckplyr 1.0.1.9005 (2025-03-12)
421

522
## Features

R/relational-duckdb.R

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,6 @@ duckplyr_macros <- c(
5050
"|" = "(x, y) AS (x OR y)",
5151
"&" = "(x, y) AS (x AND y)",
5252
"!" = "(x) AS (NOT x)",
53-
"n_distinct" = "(x) AS (COUNT(DISTINCT x))",
5453
#
5554
"wday" = "(x) AS CAST(weekday(CAST (x AS DATE)) + 1 AS int32)",
5655
#
@@ -69,6 +68,10 @@ duckplyr_macros <- c(
6968
"___sd_na" = "(x) AS (CASE WHEN SUM(CASE WHEN x IS NULL THEN 1 ELSE 0 END) > 0 THEN NULL ELSE STDDEV(x) END)",
7069
"___median_na" = "(x) AS (CASE WHEN SUM(CASE WHEN x IS NULL THEN 1 ELSE 0 END) > 0 THEN NULL ELSE percentile_cont(0.5) WITHIN GROUP (ORDER BY x) END)",
7170
#
71+
# In n_distinct() many NAs count as 1 if not filtered out with na.rm = TRUE
72+
"___n_distinct_na" = "(x) AS (CASE WHEN SUM(CASE WHEN x IS NULL THEN 1 ELSE 0 END) > 0 THEN (COUNT(DISTINCT x)+1) ELSE COUNT(DISTINCT x) END)",
73+
"___n_distinct" = "(x) AS (COUNT(DISTINCT x))",
74+
#
7275
NULL
7376
)
7477

R/translate.R

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ rel_translate_lang <- function(
180180
}
181181

182182

183-
if (!(name %in% c("wday", "strftime", "lag", "lead", "sum", "min", "max", "any", "all", "mean", "median", "sd"))) {
183+
if (!(name %in% c("wday", "strftime", "lag", "lead", "sum", "min", "max", "any", "all", "mean", "median", "sd", "n_distinct"))) {
184184
if (!is.null(names(expr)) && any(names(expr) != "")) {
185185
# Fix grepl() and sum()/min()/max() logic below when allowing matching by argument name
186186
cli::cli_abort("Can't translate named argument {.code {name}({names(expr)[names(expr) != ''][[1]]} = )}.", call = call)
@@ -297,6 +297,7 @@ rel_translate_lang <- function(
297297

298298
# Aggregates
299299
"sum", "min", "max", "any", "all", "mean", "sd", "median",
300+
"n_distinct",
300301
#
301302
NULL
302303
)
@@ -335,7 +336,7 @@ rel_translate_lang <- function(
335336

336337
# Other primitives: prod, range
337338
# Other aggregates: var(), cum*(), quantile()
338-
if (name %in% c("sum", "min", "max", "any", "all", "mean", "sd", "median")) {
339+
if (name %in% c("sum", "min", "max", "any", "all", "mean", "sd", "median", "n_distinct")) {
339340
is_primitive <- (name %in% c("sum", "min", "max", "any", "all"))
340341

341342
if (is_primitive) {
@@ -366,18 +367,26 @@ rel_translate_lang <- function(
366367
}
367368

368369
if (window) {
369-
if (identical(na_rm, FALSE)) {
370-
cli::cli_abort(call = call, c(
371-
"{.code {name}(na.rm = FALSE)} not supported in window functions",
372-
i = "Use {.code {name}(na.rm = TRUE)} after checking for missing values"
373-
))
374-
} else if (!identical(na_rm, TRUE)) {
375-
cli::cli_abort("Invalid value for {.arg na.rm} in call to {.fun {name}}", call = call)
370+
if (name == "n_distinct") {
371+
cli::cli_abort("{.code {name}()} not supported in window functions", call = call)
372+
} else {
373+
if (identical(na_rm, FALSE)) {
374+
cli::cli_abort(call = call, c(
375+
"{.code {name}(na.rm = FALSE)} not supported in window functions",
376+
i = "Use {.code {name}(na.rm = TRUE)} after checking for missing values"
377+
))
378+
} else if (!identical(na_rm, TRUE)) {
379+
cli::cli_abort("Invalid value for {.arg na.rm} in call to {.fun {name}}", call = call)
380+
}
376381
}
377382
} else {
378383
if (identical(na_rm, FALSE)) {
379384
aliased_name <- paste0("___", name, "_na") # ___sum_na, ___min_na, ___max_na
380-
} else if (!identical(na_rm, TRUE)) {
385+
} else if (identical(na_rm, TRUE)) {
386+
if (name == "n_distinct") {
387+
aliased_name <- paste0("___", name)
388+
}
389+
} else {
381390
cli::cli_abort("Invalid value for {.arg na.rm} in call to {.fun {name}}", call = call)
382391
}
383392
}
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# duckdb n_distinct() error with more than one argument
2+
3+
Code
4+
df %>% summarise(dummy = n_distinct(a, b))
5+
Condition
6+
Error in `summarise()`:
7+
! `n_distinct()` needs exactly one argument besides the optional `na.rm`
8+
9+
# duckdb n_distinct() error with na.rm not being TRUE/FALSE
10+
11+
Code
12+
df %>% summarise(dummy = n_distinct(a, na.rm = "b"))
13+
Condition
14+
Error in `summarise()`:
15+
! Invalid value for `na.rm` in call to `n_distinct()`
16+
17+
# duckdb n_distinct() error with mutate
18+
19+
Code
20+
df %>% mutate(dummy = n_distinct(a))
21+
Condition
22+
Error in `mutate()`:
23+
! `n_distinct()` not supported in window functions
24+

tests/testthat/test-n_distinct.R

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
test_that("duckdb n_distinct() basic", {
2+
withr::local_envvar(DUCKPLYR_FORCE = TRUE)
3+
4+
df <- duckdb_tibble(
5+
a = c(1, 1, 2, 2, 2),
6+
b = c(3, 3, NA, 3, 3)
7+
)
8+
9+
out <- df %>%
10+
summarise( n_distinct_a = n_distinct(a),
11+
n_distinct_a_na_rm = n_distinct(a, na.rm = TRUE),
12+
n_distinct_b = n_distinct(b, na.rm = FALSE),
13+
n_distinct_b_na_rm = n_distinct(b, na.rm = TRUE)
14+
)
15+
16+
expect_equal(out$n_distinct_a, 2)
17+
expect_equal(out$n_distinct_a_na_rm, 2)
18+
expect_equal(out$n_distinct_b, 2)
19+
expect_equal(out$n_distinct_b_na_rm, 1)
20+
})
21+
22+
23+
test_that("duckdb n_distinct() counts empty inputs", {
24+
withr::local_envvar(DUCKPLYR_FORCE = TRUE)
25+
26+
df <- duckdb_tibble(
27+
a = integer(),
28+
b = double(),
29+
c = logical(),
30+
d = character()
31+
)
32+
33+
out <- df %>%
34+
summarise( n_distinct_a = n_distinct(a),
35+
n_distinct_b = n_distinct(b),
36+
n_distinct_c = n_distinct(c),
37+
n_distinct_d = n_distinct(d),
38+
)
39+
40+
expect_equal(out$n_distinct_a, 0)
41+
expect_equal(out$n_distinct_b, 0)
42+
expect_equal(out$n_distinct_c, 0)
43+
expect_equal(out$n_distinct_d, 0)
44+
})
45+
46+
47+
test_that("duckdb n_distinct() counts unique values in simple vectors", {
48+
withr::local_envvar(DUCKPLYR_FORCE = TRUE)
49+
50+
df <- duckdb_tibble(
51+
a = c(TRUE, FALSE, NA),
52+
b = c(1, 2, NA),
53+
c = c(1L, 2L, NA),
54+
d = c("x", "y", NA)
55+
)
56+
57+
out <- df %>%
58+
summarise( n_distinct_a = n_distinct(a),
59+
n_distinct_b = n_distinct(b),
60+
n_distinct_c = n_distinct(c),
61+
n_distinct_d = n_distinct(d),
62+
)
63+
64+
expect_equal(out$n_distinct_a, 3)
65+
expect_equal(out$n_distinct_b, 3)
66+
expect_equal(out$n_distinct_c, 3)
67+
expect_equal(out$n_distinct_d, 3)
68+
})
69+
70+
71+
test_that("duckdb n_distinct() can drop missing values", {
72+
withr::local_envvar(DUCKPLYR_FORCE = TRUE)
73+
74+
df <- duckdb_tibble(
75+
a = c(NA),
76+
b = c(NA, 0),
77+
)
78+
79+
out <- df %>%
80+
summarise( n_distinct_a = n_distinct(a, na.rm = TRUE),
81+
n_distinct_b = n_distinct(b, na.rm = TRUE),
82+
)
83+
84+
expect_equal(out$n_distinct_a, 0)
85+
expect_equal(out$n_distinct_b, 1)
86+
})
87+
88+
89+
test_that("duckdb n_distinct() counts NA correctly", {
90+
withr::local_envvar(DUCKPLYR_FORCE = TRUE)
91+
92+
df <- duckdb_tibble(
93+
a = c(1, NA, 1, NA, 2, NA, 2),
94+
b = c(3, 3, NA, 3, NA, 4, 5)
95+
)
96+
97+
out <- df %>%
98+
summarise( n_distinct_a = n_distinct(a),
99+
n_distinct_a_na_rm = n_distinct(a, na.rm = TRUE),
100+
n_distinct_b = n_distinct(b, na.rm = FALSE),
101+
n_distinct_b_na_rm = n_distinct(b, na.rm = TRUE)
102+
)
103+
104+
expect_equal(out$n_distinct_a, 3)
105+
expect_equal(out$n_distinct_a_na_rm, 2)
106+
expect_equal(out$n_distinct_b, 4)
107+
expect_equal(out$n_distinct_b_na_rm, 3)
108+
})
109+
110+
111+
test_that("duckdb n_distinct() error with more than one argument", {
112+
withr::local_envvar(DUCKPLYR_FORCE = TRUE)
113+
114+
df <- duckdb_tibble(
115+
a = c(1, 1, 2, 2, 2),
116+
b = c(3, 3, NA, 3, 3)
117+
)
118+
119+
expect_snapshot( error = TRUE, {
120+
df %>% summarise( dummy = n_distinct(a, b) )
121+
})
122+
})
123+
124+
125+
test_that("duckdb n_distinct() error with na.rm not being TRUE/FALSE", {
126+
withr::local_envvar(DUCKPLYR_FORCE = TRUE)
127+
128+
df <- duckdb_tibble(
129+
a = c(1, 2),
130+
)
131+
132+
expect_snapshot( error = TRUE, {
133+
df %>% summarise( dummy = n_distinct(a, na.rm = "b") )
134+
})
135+
})
136+
137+
138+
test_that("duckdb n_distinct() error with mutate", {
139+
withr::local_envvar(DUCKPLYR_FORCE = TRUE)
140+
141+
df <- duckdb_tibble(
142+
a = c(1, 1, 2, 2, 2),
143+
b = c(3, 3, NA, 3, 3)
144+
)
145+
146+
expect_snapshot( error = TRUE, {
147+
df %>% mutate( dummy = n_distinct(a) )
148+
})
149+
})

vignettes/limits.Rmd

Lines changed: 2 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -389,24 +389,9 @@ tibble(a = c(TRUE, FALSE)) |>
389389
summarize(min(a), max(a))
390390
```
391391

392-
### `n_distinct()` and missing values
392+
### `n_distinct()` and multiple arguments
393393

394-
Unlike most other aggregation functions, `n_distinct()` ignores missing values and does not support the `na.rm` argument.
395-
This is tracked in <https://github.com/tidyverse/duckplyr/issues/572>.
396-
397-
```{r error = TRUE}
398-
duckplyr::duckdb_tibble(a = c(1, 2, NA, 1)) |>
399-
summarize(n_distinct(a))
400-
401-
duckplyr::duckdb_tibble(a = c(1, 2, NA, 1), .prudence = "stingy") |>
402-
summarize(n_distinct(a, na.rm = TRUE))
403-
404-
tibble(a = c(1, 2, NA, 1)) |>
405-
summarize(n_distinct(a))
406-
407-
tibble(a = c(1, 2, NA, 1)) |>
408-
summarize(n_distinct(a, na.rm = TRUE))
409-
```
394+
This function needs exactly one argument besides the optional `na.rm`. Multiple arguments is not supported.
410395

411396
### `is.na()` and `NaN` values
412397

0 commit comments

Comments
 (0)