Skip to content

Commit 3aab640

Browse files
committed
Massive binning refactoring
* New bins class with print method * Three constructors for common use cases
1 parent f7cb898 commit 3aab640

File tree

10 files changed

+198
-169
lines changed

10 files changed

+198
-169
lines changed

DESCRIPTION

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ Collate:
6565
'annotation.r'
6666
'autoplot.r'
6767
'bench.r'
68+
'bin.R'
6869
'coord-.r'
6970
'coord-cartesian-.r'
7071
'coord-fixed.r'

NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ S3method(predictdf,loess)
9191
S3method(print,element)
9292
S3method(print,facet)
9393
S3method(print,ggplot)
94+
S3method(print,ggplot2_bins)
9495
S3method(print,ggproto)
9596
S3method(print,ggproto_method)
9697
S3method(print,rel)

NEWS.md

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -77,14 +77,9 @@
7777
defaults to `pad = FALSE` which considerably improves the default limits
7878
for the histogram, especially when the bins are big (#1477).
7979

80-
* The default algorithm does a better job at picking nice width and
80+
* The default algorithm does a better job at picking nice widths and
8181
origins across a wider range of input data.
8282

83-
This change brings with the removal of some rarely used features:
84-
85-
* The `breaks` argument is no longer supported - this was rarely used
86-
and didn't work correctly for frequency polygons.
87-
8883
* `geom_tile()` uses `draw_key_polygon()` for better legend keys, including
8984
coloured outline (#1484).
9085

R/bin.R

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
bins <- function(breaks, closed = c("right", "left"),
2+
fuzz = 1e-08 * median(diff(breaks))) {
3+
stopifnot(is.numeric(breaks))
4+
closed <- match.arg(closed)
5+
6+
breaks <- sort(breaks)
7+
# Adapted base::hist - this protects from floating point rounding errors
8+
if (closed == "right") {
9+
fuzzes <- c(-fuzz, rep.int(fuzz, length(breaks) - 1))
10+
} else {
11+
fuzzes <- c(rep.int(-fuzz, length(breaks) - 1), fuzz)
12+
}
13+
14+
structure(
15+
list(
16+
breaks = breaks,
17+
fuzzy = breaks + fuzzes,
18+
right_closed = closed == "right"
19+
),
20+
class = "ggplot2_bins"
21+
)
22+
}
23+
24+
is_bins <- function(x) inherits(x, "ggplot2_bins")
25+
26+
#' @export
27+
print.ggplot2_bins <- function(x, ...) {
28+
n <- length(x$breaks)
29+
cat("<Bins>\n")
30+
31+
if (x$right_closed) {
32+
left <- c("[", rep("(", n - 2))
33+
right <- rep("]", n - 1)
34+
} else {
35+
left <- rep("[", n - 1)
36+
right <- c(rep(")", n - 2), "]")
37+
}
38+
39+
breaks <- format(x$breaks)
40+
bins <- paste0("* ", left, breaks[-n], ",", breaks[-1], right)
41+
cat(bins, sep = "\n")
42+
cat("\n")
43+
}
44+
45+
# Compute parameters -----------------------------------------------------------
46+
47+
bin_breaks <- function(breaks, closed = c("right", "left")) {
48+
bins(breaks, closed)
49+
}
50+
51+
bin_breaks_width <- function(x_range, width = NULL, center = NULL,
52+
boundary = NULL, closed = c("right", "left")) {
53+
stopifnot(length(x_range) == 2)
54+
55+
# if (length(x_range) == 0) {
56+
# return(bin_params(numeric()))
57+
# }
58+
stopifnot(is.numeric(width), length(width) == 1)
59+
if (width <= 0) {
60+
stop("`binwidth` must be positive", call. = FALSE)
61+
}
62+
63+
if (!is.null(boundary) && !is.null(center)) {
64+
stop("Only one of 'boundary' and 'center' may be specified.")
65+
} else if (is.null(boundary)) {
66+
if (is.null(center)) {
67+
# If neither edge nor center given, compute both using tile layer's
68+
# algorithm. This puts min and max of data in outer half of their bins.
69+
boundary <- width / 2
70+
71+
} else {
72+
# If center given but not boundary, compute boundary.
73+
boundary <- center - width / 2
74+
}
75+
}
76+
77+
# Find the left side of left-most bin: inputs could be Dates or POSIXct, so
78+
# coerce to numeric first.
79+
x_range <- as.numeric(x_range)
80+
width <- as.numeric(width)
81+
boundary <- as.numeric(boundary)
82+
shift <- floor((x_range[1] - boundary) / width)
83+
origin <- boundary + shift * width
84+
85+
# Small correction factor so that we don't get an extra bin when, for
86+
# example, origin = 0, max(x) = 20, width = 10.
87+
max_x <- x_range[2] + (1 - 1e-08) * width
88+
breaks <- seq(origin, max_x, width)
89+
90+
bin_breaks(breaks, closed = closed)
91+
}
92+
93+
bin_breaks_bins <- function(x_range, bins = 30, center = NULL,
94+
boundary = NULL, closed = c("right", "left")) {
95+
stopifnot(length(x_range) == 2)
96+
97+
bins <- as.integer(bins)
98+
if (bins < 1) {
99+
stop("Need at least one bin.", call. = FALSE)
100+
} else if (bins == 1) {
101+
width <- diff(x_range)
102+
boundary <- x_range[1]
103+
} else {
104+
width <- (x_range[2] - x_range[1]) / (bins - 1)
105+
}
106+
107+
bin_breaks_width(x_range, width, boundary = boundary, center = center,
108+
closed = closed)
109+
}
110+
111+
112+
# Compute bins ------------------------------------------------------------
113+
114+
bin_vector <- function(x, bins, weight = NULL, pad = FALSE) {
115+
stopifnot(is_bins(bins))
116+
117+
if (all(is.na(x))) {
118+
return(bin_out(length(x), NA, NA, xmin = NA, xmax = NA))
119+
}
120+
121+
if (is.null(weight)) {
122+
weight <- rep(1, length(x))
123+
} else {
124+
weight[is.na(weight)] <- 0
125+
}
126+
127+
bin_idx <- cut(x, bins$breaks, right = bins$right_closed,
128+
include.lowest = TRUE)
129+
bin_count <- as.numeric(tapply(weight, bin_idx, sum, na.rm = TRUE))
130+
bin_count[is.na(bin_count)] <- 0
131+
132+
bin_x <- (bins$breaks[-length(bins$breaks)] + bins$breaks[-1]) / 2
133+
bin_widths <- diff(bins$breaks)
134+
135+
# Pad row of 0s at start and end
136+
if (pad) {
137+
bin_count <- c(0, bin_count, 0)
138+
139+
width1 <- bin_widths[1]
140+
widthn <- bin_widths[length(bin_widths)]
141+
142+
bin_widths <- c(width1, bin_widths, widthn)
143+
bin_x <- c(bin_x[1] - width1, bin_x, bin_x[length(bin_x)] + widthn)
144+
}
145+
146+
# Add row for missings
147+
if (any(is.na(bins))) {
148+
bin_count <- c(bin_count, sum(is.na(bins)))
149+
bin_widths <- c(bin_widths, NA)
150+
bin_x <- c(bin_x, NA)
151+
}
152+
153+
bin_out(bin_count, bin_x, bin_widths)
154+
}
155+
156+
bin_out <- function(count = integer(0), x = numeric(0), width = numeric(0),
157+
xmin = x - width / 2, xmax = x + width / 2) {
158+
density <- count / width / sum(abs(count))
159+
160+
data.frame(
161+
count = count,
162+
x = x,
163+
xmin = xmin,
164+
xmax = xmax,
165+
width = width,
166+
density = density,
167+
ncount = count / max(abs(count)),
168+
ndensity = count / max(abs(density)),
169+
stringsAsFactors = FALSE
170+
)
171+
}

R/stat-bin.r

Lines changed: 10 additions & 146 deletions
Original file line numberDiff line numberDiff line change
@@ -99,17 +99,13 @@ StatBin <- ggproto("StatBin", Stat,
9999
params$closed <- if (params$right) "right" else "left"
100100
params$right <- NULL
101101
}
102-
if (!is.null(params$breaks)) {
103-
stop("`breaks` is deprecated.", call. = FALSE)
104-
}
105102
if (!is.null(params$width)) {
106103
stop("`width` is deprecated. Do you want `geom_bar()`?", call. = FALSE)
107104
}
108105
if (!is.null(params$boundary) && !is.null(params$center)) {
109106
stop("Only one of `boundary` and `center` may be specified.", call. = FALSE)
110107
}
111108

112-
113109
if (is.null(params$breaks) && is.null(params$binwidth) && is.null(params$bins)) {
114110
message_wrap("`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.")
115111
params$bins <- 30
@@ -126,151 +122,19 @@ StatBin <- ggproto("StatBin", Stat,
126122
breaks = NULL, origin = NULL, right = NULL,
127123
drop = NULL, width = NULL) {
128124

129-
params <- bin_params(
130-
scales$x$dimension(),
131-
width = binwidth,
132-
bins = bins,
133-
center = center,
134-
boundary = boundary,
135-
closed = closed
136-
)
137-
138-
bin_vector(data$x, weight = data$weight, width = params$width,
139-
origin = params$origin, closed = params$closed, pad = pad)
125+
if (!is.null(breaks)) {
126+
bins <- bin_breaks(breaks, closed)
127+
} else if (!is.null(binwidth)) {
128+
bins <- bin_breaks_width(scales$x$dimension(), binwidth, center = center,
129+
boundary = boundary, closed = closed)
130+
} else {
131+
bins <- bin_breaks_bins(scales$x$dimension(), bins, center = center,
132+
boundary = boundary, closed = closed)
133+
}
134+
bin_vector(data$x, bins, weight = data$weight, pad = pad)
140135
},
141136

142137
default_aes = aes(y = ..count..),
143138
required_aes = c("x")
144139
)
145140

146-
147-
# Compute parameters -----------------------------------------------------------
148-
149-
bin_params <- function(x_range, width = NULL, bins = 30, center = NULL,
150-
boundary = NULL, closed = c("right", "left")) {
151-
closed <- match.arg(closed)
152-
153-
if (length(x_range) == 0) {
154-
return(list(width = width, origin = NULL, closed = closed))
155-
}
156-
157-
stopifnot(length(x_range) == 2)
158-
if (!is.null(boundary) && !is.null(center)) {
159-
stop("Only one of 'boundary' and 'center' may be specified.")
160-
}
161-
162-
if (is.null(width)) {
163-
width <- (x_range[2] - x_range[1]) / (bins - 1)
164-
}
165-
166-
if (is.null(boundary)) {
167-
if (is.null(center)) {
168-
# If neither edge nor center given, compute both using tile layer's
169-
# algorithm. This puts min and max of data in outer half of their bins.
170-
boundary <- width / 2
171-
172-
} else {
173-
# If center given but not boundary, compute boundary.
174-
boundary <- center - width / 2
175-
}
176-
}
177-
178-
# Inputs could be Dates or POSIXct, so make sure these are all numeric
179-
x_range <- as.numeric(x_range)
180-
width <- as.numeric(width)
181-
boundary <- as.numeric(boundary)
182-
183-
origin <- find_origin(x_range, width, boundary)
184-
185-
list(width = width, origin = origin, closed = closed)
186-
}
187-
188-
# Find the left side of left-most bin
189-
find_origin <- function(x_range, width, boundary) {
190-
shift <- floor((x_range[1] - boundary) / width)
191-
boundary + shift * width
192-
}
193-
194-
bin_vector <- function(x, weight = NULL, ..., width = 1,
195-
origin = 0, closed = c("right", "left"),
196-
pad = FALSE) {
197-
closed <- match.arg(closed)
198-
199-
if (all(is.na(x))) {
200-
return(bin_out(length(x), NA, NA, xmin = NA, xmax = NA))
201-
}
202-
203-
stopifnot(is.numeric(width) && length(width) == 1)
204-
stopifnot(is.numeric(origin) && length(origin) == 1)
205-
206-
if (is.null(weight)) {
207-
weight <- rep(1, length(x))
208-
} else {
209-
weight[is.na(weight)] <- 0
210-
}
211-
212-
min_x <- origin
213-
# Small correction factor so that we don't get an extra bin when, for
214-
# example, origin=0, max(x)=20, width=10.
215-
max_x <- max(x, na.rm = TRUE) + (1 - 1e-08) * width
216-
breaks <- seq(min_x, max_x, width)
217-
fuzzybreaks <- adjust_breaks2(breaks, closed = closed)
218-
219-
bins <- cut(x, fuzzybreaks, include.lowest = TRUE, right = (closed == "right"))
220-
221-
left <- breaks[-length(breaks)]
222-
right <- breaks[-1]
223-
x <- (left + right) / 2
224-
bin_widths <- diff(breaks)
225-
226-
count <- as.numeric(tapply(weight, bins, sum, na.rm = TRUE))
227-
count[is.na(count)] <- 0
228-
229-
if (pad) {
230-
count <- c(0, count, 0)
231-
bin_widths <- c(width, bin_widths, width)
232-
x <- c(x[1] - width, x, x[length(x)] + width)
233-
}
234-
235-
# Add row for missings
236-
if (any(is.na(bins))) {
237-
count <- c(count, sum(is.na(bins)))
238-
left <- c(left, NA)
239-
right <- c(right, NA)
240-
x <- c(x, NA)
241-
bin_widths <- c(bin_widths, NA)
242-
}
243-
244-
bin_out(count, x, bin_widths)
245-
}
246-
247-
bin_out <- function(count = integer(0), x = numeric(0), width = numeric(0),
248-
xmin = x - width / 2, xmax = x + width / 2) {
249-
density <- count / width / sum(abs(count))
250-
251-
data.frame(
252-
count = count,
253-
x = x,
254-
xmin = xmin,
255-
xmax = xmax,
256-
width = width,
257-
density = density,
258-
ncount = count / max(abs(count)),
259-
ndensity = count / max(abs(density)),
260-
stringsAsFactors = FALSE
261-
)
262-
}
263-
264-
# Adapt break fuzziness from base::hist - this protects from floating
265-
# point rounding errors
266-
adjust_breaks2 <- function(breaks, closed = "left") {
267-
closed <- match.arg(closed, c("right", "left"))
268-
269-
diddle <- 1e-08 * median(diff(breaks))
270-
if (closed == "right") {
271-
fuzz <- c(-diddle, rep.int(diddle, length(breaks) - 1))
272-
} else {
273-
fuzz <- c(rep.int(-diddle, length(breaks) - 1), diddle)
274-
}
275-
sort(breaks) + fuzz
276-
}

0 commit comments

Comments
 (0)