Skip to content

Commit c9dada7

Browse files
committed
Overhaul stat_bin to use @rprium's methods.
Fixes #1477.
1 parent 8c87810 commit c9dada7

File tree

7 files changed

+336
-111
lines changed

7 files changed

+336
-111
lines changed

NEWS.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,26 @@
11
# ggplot2 2.0.0.9000
22

3+
* `stat_bin()` has been overhauled to use the same algorithm as ggvis, which
4+
has been considerably improved thanks to the advice of Randy Prium (@rpruim).
5+
This includes:
6+
7+
* Better arguments and a better algorithm for determining the origin.
8+
You can now specify either `boundary` or the `center` of a bin.
9+
`origin` has been deprecated in favour of these arguments.
10+
11+
* `drop` is deprecated in favour of `pad`, which adds extra 0-count bins
12+
at either end. This is needed for frequency polygons. `geom_histogram()`
13+
defaults to `pad = FALSE` which considerably improves the default limits
14+
for the histogram, especially when the bins are big (#1477).
15+
16+
* The default algorithm does a better job at picking nice width and
17+
origins across a wider range of input data.
18+
19+
This change brings with the removal of some rarely used features:
20+
21+
* The `breaks` argument is no longer supported - this was rarely used
22+
and didn't work correctly for frequency polygons.
23+
324
* `geom_tile()` uses `draw_key_polygon()` for better legend keys, including
425
coloured outline (#1484).
526

R/geom-freqpoly.r

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ geom_freqpoly <- function(mapping = NULL, data = NULL, stat = "bin",
1313
inherit.aes = inherit.aes,
1414
params = list(
1515
na.rm = na.rm,
16+
pad = TRUE,
1617
...
1718
)
1819
)

R/geom-histogram.r

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -70,10 +70,10 @@
7070
#' m + geom_histogram(binwidth = 0.5) + scale_y_sqrt()
7171
#' }
7272
#' rm(movies)
73-
geom_histogram <- function(mapping = NULL, data = NULL, stat = "bin",
74-
binwidth = NULL, bins = NULL, origin = NULL,
75-
right = FALSE, position = "stack", na.rm = FALSE,
76-
show.legend = NA, inherit.aes = TRUE, ...) {
73+
geom_histogram <- function(mapping = NULL, data = NULL,
74+
stat = "bin", position = "stack", ...,
75+
binwidth = NULL, bins = NULL,
76+
na.rm = FALSE, show.legend = NA, inherit.aes = TRUE) {
7777

7878
layer(
7979
data = data,
@@ -86,9 +86,8 @@ geom_histogram <- function(mapping = NULL, data = NULL, stat = "bin",
8686
params = list(
8787
binwidth = binwidth,
8888
bins = bins,
89-
origin = origin,
90-
right = right,
9189
na.rm = na.rm,
90+
pad = FALSE,
9291
...
9392
)
9493
)

R/stat-bin.r

Lines changed: 190 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,29 @@
11
#' \code{stat_bin} is suitable only for continuous x data. If your x data is
22
#' discrete, you probably want to use \code{\link{stat_count}}.
33
#'
4-
#' @param binwidth Bin width to use. Defaults to 1/\code{bins} of the range of
5-
#' the data
6-
#' @param bins Number of bins. Overridden by \code{binwidth} or \code{breaks}.
7-
#' Defaults to 30
8-
#' @param breaks Actual breaks to use. Overrides bin width, bin number and
9-
#' origin
10-
#' @param origin Origin of first bin
11-
#' @param width Width of bars when used with categorical data
12-
#' @param right If \code{TRUE}, right-closed, left-open, if \code{FALSE},
13-
#' the default, right-open, left-closed.
14-
#' @param drop If TRUE, remove all bins with zero counts
4+
#' @param binwidth The width of the bins. The default is to use \code{bins}
5+
#' bins that cover the range of the data. You should always override
6+
#' this value, exploring multiple widths to find the best to illustrate the
7+
#' stories in your data.
8+
#'
9+
#' The bin width of a date variable is the number of days in each time; the
10+
#' bin width of a time variable is the number of seconds.
11+
#' @param bins Number of bins. Overridden by \code{binwidth}. Defaults to 30
12+
#' @param center The center of one of the bins. Note that if center is above or
13+
#' below the range of the data, things will be shifted by an appropriate
14+
#' number of \code{width}s. To center on integers, for example, use
15+
#' \code{width=1} and \code{center=0}, even if \code{0} is outside the range
16+
#' of the data. At most one of \code{center} and \code{boundary} may be
17+
#' specified.
18+
#' @param boundary A boundary between two bins. As with \code{center}, things
19+
#' are shifted when \code{boundary} is outside the range of the data. For
20+
#' example, to center on integers, use \code{width = 1} and \code{boundary =
21+
#' 0.5}, even if \code{1} is outside the range of the data. At most one of
22+
#' \code{center} and \code{boundary} may be specified.
23+
#' @param closed One of \code{"right"} or \code{"left"} indicating whether right
24+
#' or left edges of bins are included in the bin.
25+
#' @param pad If \code{TRUE}, adds empty bins at either end of x. This ensures
26+
#' frequency polygons touch 0. Defaults to \code{FALSE}.
1527
#' @section Computed variables:
1628
#' \describe{
1729
#' \item{count}{number of points in bin}
@@ -26,10 +38,11 @@
2638
#' @export
2739
#' @rdname geom_histogram
2840
stat_bin <- function(mapping = NULL, data = NULL, geom = "bar",
29-
position = "stack", width = 0.9, drop = FALSE,
30-
right = FALSE, binwidth = NULL, bins = NULL, origin = NULL,
31-
breaks = NULL, na.rm = FALSE,
32-
show.legend = NA, inherit.aes = TRUE, ...) {
41+
position = "stack", ...,
42+
binwidth = NULL, bins = NULL, center = NULL, boundary = NULL,
43+
closed = c("right", "left"), pad = FALSE,
44+
na.rm = FALSE, show.legend = NA, inherit.aes = TRUE) {
45+
3346
layer(
3447
data = data,
3548
mapping = mapping,
@@ -39,13 +52,12 @@ stat_bin <- function(mapping = NULL, data = NULL, geom = "bar",
3952
show.legend = show.legend,
4053
inherit.aes = inherit.aes,
4154
params = list(
42-
width = width,
43-
drop = drop,
44-
right = right,
45-
bins = bins,
4655
binwidth = binwidth,
47-
origin = origin,
48-
breaks = breaks,
56+
bins = bins,
57+
center = center,
58+
boundary = boundary,
59+
closed = closed,
60+
pad = pad,
4961
na.rm = na.rm,
5062
...
5163
)
@@ -66,92 +78,192 @@ StatBin <- ggproto("StatBin", Stat,
6678
call. = FALSE)
6779
}
6880

81+
if (!is.null(params$drop)) {
82+
warning("`drop` is deprecated. Please use `pad` instead.", call. = FALSE)
83+
params$drop <- NULL
84+
}
85+
if (!is.null(params$origin)) {
86+
warning("`origin` is deprecated. Please use `boundary` instead.", call. = FALSE)
87+
params$boundary <- params$origin
88+
params$origin <- NULL
89+
}
90+
if (!is.null(params$right)) {
91+
warning("`right` is deprecated. Please use `closed` instead.", call. = FALSE)
92+
params$closed <- if (params$right) "right" else "left"
93+
params$right <- NULL
94+
}
95+
if (!is.null(params$breaks)) {
96+
stop("`breaks` is deprecated.", call. = FALSE)
97+
}
98+
if (!is.null(params$width)) {
99+
stop("`width` is deprecated. Do you want `geom_bar()`?", call. = FALSE)
100+
}
101+
if (!is.null(params$boundary) && !is.null(params$center)) {
102+
stop("Only one of `boundary` and `center` may be specified.", call. = FALSE)
103+
}
104+
105+
69106
if (is.null(params$breaks) && is.null(params$binwidth) && is.null(params$bins)) {
70107
message_wrap("`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.")
108+
params$bins <- 30
71109
}
72110

73111
params
74112
},
75113

76114
compute_group = function(data, scales, binwidth = NULL, bins = NULL,
77-
origin = NULL, breaks = NULL, width = 0.9, drop = FALSE,
78-
right = FALSE) {
79-
range <- scales$x$dimension()
115+
center = NULL, boundary = NULL,
116+
closed = c("right", "left"), pad = FALSE,
117+
# The following arguments are not used, but must
118+
# be listed so parameters are computed correctly
119+
breaks = NULL, origin = NULL, right = NULL,
120+
drop = NULL, width = NULL) {
80121

81-
bin(data$x, data$weight, binwidth = binwidth, bins = bins,
82-
origin = origin, breaks = breaks, range = range, width = width,
83-
drop = drop, right = right)
122+
params <- bin_params(
123+
scales$x$dimension(),
124+
width = binwidth,
125+
bins = bins,
126+
center = center,
127+
boundary = boundary,
128+
closed = closed
129+
)
130+
131+
bin_vector(data$x, weight = data$weight, width = params$width,
132+
origin = params$origin, closed = params$closed, pad = pad)
84133
},
85134

86135
default_aes = aes(y = ..count..),
87136
required_aes = c("x")
88137
)
89138

90-
bin <- function(x, weight=NULL, binwidth=NULL, bins=NULL, origin=NULL, breaks=NULL, range=NULL, width=0.9, drop = FALSE, right = FALSE) {
91139

92-
if (length(stats::na.omit(x)) == 0) return(data.frame())
93-
if (is.null(weight)) weight <- rep(1, length(x))
94-
weight[is.na(weight)] <- 0
140+
# Compute parameters -----------------------------------------------------------
95141

96-
if (is.null(range)) range <- range(x, na.rm = TRUE, finite = TRUE)
142+
bin_params <- function(x_range, width = NULL, bins = 30, center = NULL,
143+
boundary = NULL, closed = c("right", "left")) {
144+
closed <- match.arg(closed)
97145

98-
if (is.null(bins)) {
99-
bins <- 30
100-
} else {
101-
stopifnot(is.numeric(bins), length(bins) == 1, bins > 1)
146+
if (length(x_range) == 0) {
147+
return(list(width = width, origin = NULL, closed = closed))
102148
}
103149

104-
if (is.null(binwidth)) binwidth <- diff(range) / (bins - 1)
105-
106-
if (is.integer(x)) {
107-
bins <- x
108-
x <- sort(unique(bins))
109-
width <- width
110-
} else if (diff(range) == 0) {
111-
width <- width
112-
bins <- x
113-
} else {# if (is.numeric(x))
114-
if (is.null(breaks)) {
115-
if (is.null(origin)) {
116-
breaks <- fullseq(range, binwidth, pad = TRUE)
117-
} else {
118-
breaks <- seq(origin, max(range) + binwidth, binwidth)
119-
}
120-
}
150+
stopifnot(length(x_range) == 2)
151+
if (!is.null(boundary) && !is.null(center)) {
152+
stop("Only one of 'boundary' and 'center' may be specified.")
153+
}
154+
155+
if (is.null(width)) {
156+
width <- (x_range[2] - x_range[1]) / (bins - 1)
157+
}
158+
159+
if (is.null(boundary)) {
160+
if (is.null(center)) {
161+
# If neither edge nor center given, compute both using tile layer's
162+
# algorithm. This puts min and max of data in outer half of their bins.
163+
boundary <- width / 2
121164

122-
# Adapt break fuzziness from base::hist - this protects from floating
123-
# point rounding errors
124-
diddle <- 1e-07 * stats::median(diff(breaks))
125-
if (right) {
126-
fuzz <- c(-diddle, rep.int(diddle, length(breaks) - 1))
127165
} else {
128-
fuzz <- c(rep.int(-diddle, length(breaks) - 1), diddle)
166+
# If center given but not boundary, compute boundary.
167+
boundary <- center - width / 2
129168
}
130-
fuzzybreaks <- sort(breaks) + fuzz
169+
}
170+
171+
# Inputs could be Dates or POSIXct, so make sure these are all numeric
172+
x_range <- as.numeric(x_range)
173+
width <- as.numeric(width)
174+
boundary <- as.numeric(boundary)
175+
176+
origin <- find_origin(x_range, width, boundary)
131177

132-
bins <- cut(x, fuzzybreaks, include.lowest = TRUE, right = right)
133-
left <- breaks[-length(breaks)]
134-
right <- breaks[-1]
135-
x <- (left + right)/2
136-
width <- diff(breaks)
178+
list(width = width, origin = origin, closed = closed)
179+
}
180+
181+
# Find the left side of left-most bin
182+
find_origin <- function(x_range, width, boundary) {
183+
shift <- floor((x_range[1] - boundary) / width)
184+
boundary + shift * width
185+
}
186+
187+
bin_vector <- function(x, weight = NULL, ..., width = 1,
188+
origin = 0, closed = c("right", "left"),
189+
pad = FALSE) {
190+
closed <- match.arg(closed)
191+
192+
if (all(is.na(x))) {
193+
return(bin_out(length(x), NA, NA, xmin = NA, xmax = NA))
194+
}
195+
196+
stopifnot(is.numeric(width) && length(width) == 1)
197+
stopifnot(is.numeric(origin) && length(origin) == 1)
198+
199+
if (is.null(weight)) {
200+
weight <- rep(1, length(x))
201+
} else {
202+
weight[is.na(weight)] <- 0
137203
}
138204

139-
results <- data.frame(
140-
count = as.numeric(tapply(weight, bins, sum, na.rm = TRUE)),
205+
min_x <- origin
206+
# Small correction factor so that we don't get an extra bin when, for
207+
# example, origin=0, max(x)=20, width=10.
208+
max_x <- max(x, na.rm = TRUE) + (1 - 1e-08) * width
209+
breaks <- seq(min_x, max_x, width)
210+
fuzzybreaks <- adjust_breaks2(breaks, closed = closed)
211+
212+
bins <- cut(x, fuzzybreaks, include.lowest = TRUE, right = (closed == "right"))
213+
214+
left <- breaks[-length(breaks)]
215+
right <- breaks[-1]
216+
x <- (left + right) / 2
217+
bin_widths <- diff(breaks)
218+
219+
count <- as.numeric(tapply(weight, bins, sum, na.rm = TRUE))
220+
count[is.na(count)] <- 0
221+
222+
if (pad) {
223+
count <- c(0, count, 0)
224+
bin_widths <- c(width, bin_widths, width)
225+
x <- c(x[1] - width, x, x[length(x)] + width)
226+
}
227+
228+
# Add row for missings
229+
if (any(is.na(bins))) {
230+
count <- c(count, sum(is.na(bins)))
231+
left <- c(left, NA)
232+
right <- c(right, NA)
233+
x <- c(x, NA)
234+
bin_widths <- c(bin_widths, NA)
235+
}
236+
237+
bin_out(count, x, bin_widths)
238+
}
239+
240+
bin_out <- function(count = integer(0), x = numeric(0), width = numeric(0),
241+
xmin = x - width / 2, xmax = x + width / 2) {
242+
density <- count / width / sum(abs(count))
243+
244+
data.frame(
245+
count = count,
141246
x = x,
142-
width = width
247+
xmin = xmin,
248+
xmax = xmax,
249+
width = width,
250+
density = density,
251+
ncount = count / max(abs(count)),
252+
ndensity = count / max(abs(density)),
253+
stringsAsFactors = FALSE
143254
)
255+
}
144256

145-
if (sum(results$count, na.rm = TRUE) == 0) {
146-
return(results)
147-
}
257+
# Adapt break fuzziness from base::hist - this protects from floating
258+
# point rounding errors
259+
adjust_breaks2 <- function(breaks, closed = "left") {
260+
closed <- match.arg(closed, c("right", "left"))
148261

149-
results$count[is.na(results$count)] <- 0
150-
results$density <- results$count / results$width / sum(abs(results$count), na.rm = TRUE)
151-
results$ncount <- results$count / max(abs(results$count), na.rm = TRUE)
152-
results$ndensity <- results$density / max(abs(results$density), na.rm = TRUE)
153-
if (drop) {
154-
results <- results[results$count > 0, , drop = FALSE]
262+
diddle <- 1e-08 * median(diff(breaks))
263+
if (closed == "right") {
264+
fuzz <- c(-diddle, rep.int(diddle, length(breaks) - 1))
265+
} else {
266+
fuzz <- c(rep.int(-diddle, length(breaks) - 1), diddle)
155267
}
156-
results
268+
sort(breaks) + fuzz
157269
}

0 commit comments

Comments
 (0)