Skip to content

Add bounds argument to geom_density() #4013

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Jul 26, 2022
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# ggplot2 (development version)

* `geom_density()` and `stat_density()` now support `bounds` argument
to estimate density with boundary correction (@echasnovski, #4013).

* ggplot now checks during statistical transformations whether any data
columns were dropped and warns about this. If stats intend to drop
data columns they can declare them in the new field `dropped_aes`.
Expand Down
6 changes: 6 additions & 0 deletions R/geom-density.r
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,12 @@
#' geom_density(alpha = 0.1) +
#' xlim(55, 70)
#'
#' # Use `bounds` to adjust computation for known data limits
#' big_diamonds <- diamonds[diamonds$carat >= 1, ]
#' ggplot(big_diamonds, aes(carat)) +
#' geom_density(color = 'red') +
#' geom_density(bounds = c(1, Inf), color = 'blue')
#'
#' \donttest{
#' # Stacked density plots: if you want to create a stacked density plot, you
#' # probably want to 'count' (density * n) variable instead of the default
Expand Down
90 changes: 85 additions & 5 deletions R/stat-density.r
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@
#' not line-up, and hence you won't be able to stack density values.
#' This parameter only matters if you are displaying multiple densities in
#' one plot or if you are manually adjusting the scale limits.
#' @param bounds Known lower and upper bounds for estimated data. Default
#' `c(-Inf, Inf)` means that there are no (finite) bounds. If any bound is
#' finite, boundary effect of default density estimation will be corrected by
#' reflecting tails outside `bounds` around their closest edge. Data points
#' outside of bounds are removed with a warning.
#' @section Computed variables:
#' \describe{
#' \item{density}{density estimate}
Expand All @@ -36,6 +41,7 @@ stat_density <- function(mapping = NULL, data = NULL,
n = 512,
trim = FALSE,
na.rm = FALSE,
bounds = c(-Inf, Inf),
orientation = NA,
show.legend = NA,
inherit.aes = TRUE) {
Expand All @@ -55,6 +61,7 @@ stat_density <- function(mapping = NULL, data = NULL,
n = n,
trim = trim,
na.rm = na.rm,
bounds = bounds,
orientation = orientation,
...
)
Expand All @@ -70,6 +77,8 @@ StatDensity <- ggproto("StatDensity", Stat,

default_aes = aes(x = after_stat(density), y = after_stat(density), fill = NA, weight = NULL),

dropped_aes = "weight",

setup_params = function(self, data, params) {
params$flipped_aes <- has_flipped_aes(data, params, main_is_orthogonal = FALSE, main_is_continuous = TRUE)

Expand All @@ -85,7 +94,8 @@ StatDensity <- ggproto("StatDensity", Stat,
extra_params = c("na.rm", "orientation"),

compute_group = function(data, scales, bw = "nrd0", adjust = 1, kernel = "gaussian",
n = 512, trim = FALSE, na.rm = FALSE, flipped_aes = FALSE) {
n = 512, trim = FALSE, na.rm = FALSE, bounds = c(-Inf, Inf),
flipped_aes = FALSE) {
data <- flip_data(data, flipped_aes)
if (trim) {
range <- range(data$x, na.rm = TRUE)
Expand All @@ -94,22 +104,30 @@ StatDensity <- ggproto("StatDensity", Stat,
}

density <- compute_density(data$x, data$weight, from = range[1],
to = range[2], bw = bw, adjust = adjust, kernel = kernel, n = n)
to = range[2], bw = bw, adjust = adjust, kernel = kernel, n = n,
bounds = bounds)
density$flipped_aes <- flipped_aes
flip_data(density, flipped_aes)
}

)

compute_density <- function(x, w, from, to, bw = "nrd0", adjust = 1,
kernel = "gaussian", n = 512) {
kernel = "gaussian", n = 512,
bounds = c(-Inf, Inf)) {
nx <- length(x)
if (is.null(w)) {
w <- rep(1 / nx, nx)
} else {
w <- w / sum(w)
}

# Adjust data points and weights to all fit inside bounds
sample_data <- fit_data_to_bounds(bounds, x, w)
x <- sample_data$x
w <- sample_data$w
nx <- length(x)

# if less than 2 points return data frame of NAs and a warning
if (nx < 2) {
cli::cli_warn("Groups with fewer than two data points have been dropped.")
Expand All @@ -124,8 +142,16 @@ compute_density <- function(x, w, from, to, bw = "nrd0", adjust = 1,
))
}

dens <- stats::density(x, weights = w, bw = bw, adjust = adjust,
kernel = kernel, n = n, from = from, to = to)
# Decide whether to use boundary correction
if (any(is.finite(bounds))) {
dens <- stats::density(x, weights = w, bw = bw, adjust = adjust,
kernel = kernel, n = n)

dens <- reflect_density(dens = dens, bounds = bounds, from = from, to = to)
} else {
dens <- stats::density(x, weights = w, bw = bw, adjust = adjust,
kernel = kernel, n = n, from = from, to = to)
}

data_frame0(
x = dens$x,
Expand All @@ -137,3 +163,57 @@ compute_density <- function(x, w, from, to, bw = "nrd0", adjust = 1,
.size = length(dens$x)
)
}

# Check if all data points are inside bounds. If not, warn and remove them.
fit_data_to_bounds <- function(bounds, x, w) {
is_inside_bounds <- (bounds[1] <= x) & (x <= bounds[2])

if (any(!is_inside_bounds)) {
cli::cli_warn("Some data points are outside of `bounds`. Removing them.")
x <- x[is_inside_bounds]
w <- w[is_inside_bounds]
w_sum <- sum(w)
if (w_sum > 0) {
w <- w / w_sum
}
}

return(list(x = x, w = w))
}

# Update density estimation to mitigate boundary effect at known `bounds`:
# - All x values will lie inside `bounds`.
# - All y-values will be updated to have total probability of `bounds` be
# closer to 1. This is done by reflecting tails outside of `bounds` around
# their closest edge. This leads to those tails lie inside of `bounds`
# (completely, if they are not wider than `bounds` itself, which is a common
# situation) and correct boundary effect of default density estimation.
#
# `dens` - output of `stats::density`.
# `bounds` - two-element vector with left and right known (user supplied)
# bounds of x values.
# `from`, `to` - numbers used as corresponding arguments of `stats::density()`
# in case of no boundary correction.
reflect_density <- function(dens, bounds, from, to) {
# No adjustment is needed if no finite bounds are supplied
if (all(is.infinite(bounds))) {
return(dens)
}

# Estimate linearly with zero tails (crucial to account for infinite bound)
f_dens <- stats::approxfun(
x = dens$x, y = dens$y, method = "linear", yleft = 0, yright = 0
)

# Create a uniform x-grid inside `bounds`
left <- max(from, bounds[1])
right <- min(to, bounds[2])
out_x <- seq(from = left, to = right, length.out = length(dens$x))

# Update density estimation by adding reflected tails from outside `bounds`
left_reflection <- f_dens(bounds[1] + (bounds[1] - out_x))
right_reflection <- f_dens(bounds[2] + (bounds[2] - out_x))
out_y <- f_dens(out_x) + left_reflection + right_reflection

list(x = out_x, y = out_y)
}
13 changes: 13 additions & 0 deletions man/geom_density.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

91 changes: 91 additions & 0 deletions tests/testthat/test-stat-density.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,94 @@
test_that("stat_density actually computes density", {
# Compare functon approximations because outputs from `ggplot()` and
# `density()` give grids spanning different ranges
dens <- stats::density(mtcars$mpg)
expected_density_fun <- stats::approxfun(data.frame(x = dens$x, y = dens$y))

plot <- ggplot(mtcars, aes(mpg)) + stat_density()
actual_density_fun <- stats::approxfun(layer_data(plot)[, c("x", "y")])

test_sample <- unique(mtcars$mpg)
expect_equal(
expected_density_fun(test_sample),
actual_density_fun(test_sample),
tolerance = 1e-3
)
})

test_that("stat_density can make weighted density estimation", {
df <- mtcars
df$weight <- mtcars$cyl / sum(mtcars$cyl)

dens <- stats::density(df$mpg, weights = df$weight)
expected_density_fun <- stats::approxfun(data.frame(x = dens$x, y = dens$y))

plot <- ggplot(df, aes(mpg, weight = weight)) + stat_density()
actual_density_fun <- stats::approxfun(layer_data(plot)[, c("x", "y")])

test_sample <- unique(df$mpg)
expect_equal(
expected_density_fun(test_sample),
actual_density_fun(test_sample),
tolerance = 1e-3
)
})

test_that("stat_density uses `bounds`", {
mpg_min <- min(mtcars$mpg)
mpg_max <- max(mtcars$mpg)

expect_bounds <- function(bounds) {
dens <- stats::density(mtcars$mpg)
orig_density <- stats::approxfun(
data.frame(x = dens$x, y = dens$y),
yleft = 0,
yright = 0
)

bounded_plot <- ggplot(mtcars, aes(mpg)) + stat_density(bounds = bounds)
bounded_data <- layer_data(bounded_plot)[, c("x", "y")]
plot_density <- stats::approxfun(bounded_data, yleft = 0, yright = 0)

test_sample <- seq(mpg_min, mpg_max, by = 0.1)
left_reflection <- orig_density(bounds[1] + (bounds[1] - test_sample))
right_reflection <- orig_density(bounds[2] + (bounds[2] - test_sample))

# Plot density should be an original plus added reflection at both `bounds`
# (reflection around infinity is zero)
expect_equal(
orig_density(test_sample) + left_reflection + right_reflection,
plot_density(test_sample),
tolerance = 1e-4
)
}

expect_bounds(c(-Inf, Inf))
expect_bounds(c(mpg_min, Inf))
expect_bounds(c(-Inf, mpg_max))
expect_bounds(c(mpg_min, mpg_max))
})

test_that("stat_density handles data outside of `bounds`", {
cutoff <- mtcars$mpg[1]

# Both `x` and `weight` should be filtered out for out of `bounds` points
expect_warning(
data_actual <- layer_data(
ggplot(mtcars, aes(mpg, weight = cyl)) +
stat_density(bounds = c(cutoff, Inf))
),
"outside of `bounds`"
)

mtcars_filtered <- mtcars[mtcars$mpg >= cutoff, ]
data_expected <- layer_data(
ggplot(mtcars_filtered, aes(mpg, weight = cyl)) +
stat_density(bounds = c(cutoff, Inf))
)

expect_equal(data_actual, data_expected)
})

test_that("compute_density succeeds when variance is zero", {
dens <- compute_density(rep(0, 10), NULL, from = 0.5, to = 0.5)
expect_equal(dens$n, rep(10, 512))
Expand Down