mcse_dotplots.Rmd

---
title: "Quantile dotplots with Monte Carlo error"
output: github_document
---

## Introduction

This is an attempt to play with adding Monte Carlo standard errors to quantile
dotplots by "blurring" (sort of) each quantile according to its error.

## Setup

```{r setup, message = FALSE, warning = FALSE}
library(tidyverse)
library(ggplot2)
library(posterior)   # remotes::install_github("stan-dev/posterior")
library(ggdist)
library(patchwork)
library(distributional)

# Pass type = "cairo" to the png graphics device for every chunk
knitr::opts_chunk$set(dev.args = list(png = list(type = "cairo")))

# Make ggdist's theme the default ggplot2 theme for all plots below
theme_set(theme_ggdist())
```

## The problem

[Quantile dotplots](https://github.com/mjskay/when-ish-is-my-bus/blob/master/quantile-dotplots.md)
are a useful way to summarize a distribution by plotting a number of
quantiles (often 20, 50, or 100). However, when obtaining samples from Bayesian 
posterior distributions using Monte Carlo methods, there is uncertainty not
captured purely by looking at the posterior quantiles: the quantiles themselves
have error associated with them due to the Monte Carlo sampling process.

The (currently experimental) [posterior](https://mc-stan.org/posterior/) package allows calculation of this
error using [`posterior::mcse_quantile()`](https://mc-stan.org/posterior/reference/mcse_quantile.html),
but it is hard to show **two levels** of uncertainty at the same time.

This is one attempt to do that, by showing both 50 quantiles of a sample from
a distribution *and* the uncertainty in those quantiles. Here I am "blurring"
each quantile by overlaying 20 semitransparent dots, placed at quantiles of a
normal distribution centered on that quantile with standard deviation equal to
its Monte Carlo standard error:

```{r mcse_dotplot, fig.width = 5, fig.height = 10}
# color used for the annotation line and its label
annotation_color = "gray25"

# how many quantiles of the sample are drawn as dots
n_dot = 50
# how many semitransparent replicates (quantiles) of each dot create the "blur"
n_rep = 20

# Plot components shared by every dotplot panel: no axis titles, a subtitle
# reporting the sample size `n`, a fixed x range, and no y-axis breaks.
# Returns a list that can be added to a ggplot with `+`.
scales = function(n) {
  panel_subtitle = paste0("Sample size = ", n)

  list(
    labs(y = NULL, x = NULL, subtitle = panel_subtitle),
    scale_x_continuous(limits = c(-3.5, 3.5)),
    scale_y_continuous(breaks = NULL)
  )
}

# Draw a "blurred" quantile dotplot of a standard normal sample of size `n`.
#
# `n_dot` quantiles of the sample are computed together with their Monte Carlo
# standard errors. Each quantile is then replaced by `n_rep` quantiles of a
# normal distribution with that mean and standard error, and every replicate
# index is drawn as its own semitransparent dotplot layer.
#
# n:             sample size
# annotate_mcse: if TRUE, mark and label the spread of the first dot
#
# Returns a ggplot object. Reseeds the RNG (seed 1234) on every call so that
# each panel is reproducible.
mcse_dotplot = function(n = 500, annotate_mcse = TRUE) {
  set.seed(1234)
  x = rnorm(n)

  dot_probs = ppoints(n_dot)
  q_mean = quantile(x, probs = dot_probs, names = FALSE)
  q_se = mcse_quantile(x, dot_probs, names = FALSE)

  # One row per (dot, replicate), ordered by dot and then by replicate. The
  # grid is built directly rather than with a multi-row summarise(), which is
  # deprecated as of dplyr 1.1.0 and warns when the document is rendered.
  dot_id = rep(seq_len(n_dot), each = n_rep)
  rep_id = rep(seq_len(n_rep), times = n_dot)
  df = tibble(
    dot = dot_id,
    rep = rep_id,
    q = qnorm(ppoints(n_rep)[rep_id], q_mean[dot_id], q_se[dot_id])
  )

  df %>%
    ggplot(aes(y = 0, x = q, group = rep)) +
    # each replicate index is one layer of dots; alpha = 1/n_rep makes fully
    # overlapping replicates add up to roughly one opaque dot
    stat_dots(layout = "bin", alpha = 1/n_rep, color = NA) +
    (if (annotate_mcse) list(
      # line spanning the lowest and highest replicate of the first dot
      geom_line(
        aes(group = NA), y = 0.14, data = df %>% filter(dot == 1, rep %in% range(rep)),
        color = annotation_color
      ),
      # label positioned at the middle replicate of the first dot
      geom_text(
        y = 0.18, vjust = 0, label = "uncertainty in\nthe 1st %ile\ndue to Monte\nCarlo error",
        data = df %>% filter(dot == 1, rep == floor(n_rep/2)),
        lineheight = 1,
        color = annotation_color,
        # fontface = "bold",
        size = 3.5
      )
    )) +
    scales(n)
}

# Reference panel: a quantile dotplot of N(0, 1) itself, drawn from the
# analytical distribution instead of a sample (subtitle reads "infinity").
theoretical_plot = ggplot(
    tibble(mean = 0),
    aes(y = 0, dist = dist_normal(mean, 1))
  ) +
  stat_dist_dots(color = NA, quantiles = n_dot, fill = "gray75") +
  scales("infinity")


# Stack the sample-based dotplots (n = 500, 1000, 5000) above the theoretical
# one, then add the overall title and subtitle.
(
  mcse_dotplot(annotate_mcse = TRUE) /
    mcse_dotplot(n = 1000, annotate_mcse = FALSE) /
    mcse_dotplot(n = 5000, annotate_mcse = FALSE) /
    theoretical_plot
) +
  plot_annotation(
    title = "Dotplots of 50 quantiles of samples from N(0,1)",
    subtitle = "Dots are 'blurred' according to the Monte Carlo error of that quantile"
  )
```

## Histogram version

A histogram version based on a [suggestion from Dan Simpson](https://twitter.com/dan_p_simpson/status/1401051206009561090?s=20).

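We'll bin percentiles using a rolling window based on which quantiles
are within roughly 2 SEs of each other (the exact multiplier is
`abs(qnorm(1/n_max_bin/2))`, so it grows with the maximum number of bins):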

```{r mcse_histogram, fig.width = 15, fig.height = 10}
# Draw a variable-width histogram of a standard normal sample of size `n`,
# whose bins are formed by grouping neighbouring sample quantiles that fall
# within each other's Monte Carlo error.
#
# n:         sample size
# n_max_bin: maximum number of bins (the number of quantiles considered)
#
# Returns a ggplot object. Reseeds the RNG (seed 1234) on every call so that
# each panel is reproducible.
mcse_histogram = function(
  n = 500, 
  n_max_bin = 100 # maximum number of bins (quantiles)
) {

  set.seed(1234)
  x = rnorm(n)
  
  # interval half-width in SE units: about 1.96 for 20 quantiles, rising to
  # about 2.58 for 100
  se_mult = abs(qnorm(1/n_max_bin/2))
  probs = ppoints(n_max_bin, a = 0.5)
  
  df = tibble(
      q_mean = quantile(x, probs = probs),
      q_se = mcse_quantile(x, probs, names = FALSE)
    )
  
  # quantiles with an NA standard error borrow the smallest available one
  # (or 1 if none is available at all)
  if (all(is.na(df$q_se))) {
    df$q_se = 1
  } else {
    df$q_se[is.na(df$q_se)] = min(df$q_se, na.rm = TRUE)
  }
  
  # bin the quantiles: a new bin starts whenever a quantile lies above the
  # upper interval end of the quantile that opened the current bin
  bin_id = numeric(nrow(df))
  next_split = -Inf
  current_bin = 0
  for (i in seq_len(nrow(df))) {
    if (df$q_mean[[i]] > next_split) {
      next_split = df$q_mean[[i]] + se_mult * df$q_se[[i]]
      current_bin = current_bin + 1
    }
    bin_id[[i]] = current_bin
  }
  df$bin = bin_id
  
  # Then we'll figure out non-overlapping boundaries between bins:
  df_bins = df %>%
    mutate(
      lower = q_mean - se_mult * q_se,
      upper = q_mean + se_mult * q_se,
    ) %>%
    group_by(bin) %>%
    summarise(
      lower = min(lower),
      upper = max(upper)
    ) %>%
    mutate(
      # shared edge = midpoint between this bin's end and its neighbour's
      adj_lower = rowMeans(cbind(lag(upper), lower), na.rm = TRUE),
      adj_upper = rowMeans(cbind(upper, lead(lower)), na.rm = TRUE)
    )
  
  # hack to make sure data is covered
  df_bins$adj_lower[[1]] = min(x, df_bins$adj_lower[[1]])
  df_bins$adj_upper[[nrow(df_bins)]] = max(x, df_bins$adj_upper[[nrow(df_bins)]]) + .Machine$double.eps
  
  # Now we'll count how many data points in the sample are actually in these
  # bins to determine (1) densities and (2) actual bin boundaries:
  df_bins_densities = df_bins %>%
    ungroup() %>%
    mutate(
      data = map2(adj_lower, adj_upper, ~ x[.x <= x & x < .y]),
      n_in_bin = lengths(data),
      # proportion of the *sample* in each bin; dplyr's n() would give the
      # number of bins here, not the sample size
      p = n_in_bin / length(x),
      # suppress warnings for min/max if bin is empty
      lower = suppressWarnings(vapply(data, min, numeric(1))),
      upper = suppressWarnings(vapply(data, max, numeric(1)))
    ) %>%
    mutate(
      # make sure bin edges touch
      adj_lower = rowMeans(cbind(lag(upper), lower), na.rm = TRUE),
      adj_upper = rowMeans(cbind(upper, lead(lower)), na.rm = TRUE),
      density = p / (adj_upper - adj_lower)
    )
  
  tibble(
    # interleave lower and upper to draw histograms edges
    x = with(df_bins_densities, as.vector(rbind(adj_lower, adj_lower, adj_upper, adj_upper))),
    density = with(df_bins_densities, as.vector(rbind(0, density, density, 0)))
  ) %>%
    ggplot(aes(x = x)) +
    geom_ribbon(aes(ymin = 0, ymax = density), color = "gray75") +
    geom_rug(data = data.frame(x)) +
    # geom_function(fun = dnorm, color = "red") +
    labs(
      y = NULL,
      x = NULL,
      subtitle = paste0("Sample size = ", n, ", max bins = ", n_max_bin)
    ) +
    coord_cartesian(xlim = c(-3.5, 3.5)) +
    scale_y_continuous(breaks = NULL)
}


# One row of the figure: histograms of the same sample size `n` with the
# default maximum number of bins, then 50, then 20.
row = function(n) {
  mcse_histogram(n) + mcse_histogram(n, 50) + mcse_histogram(n, 20)
}

# Stack one row per sample size, then add the overall title and subtitle.
(row(500) / row(1000) / row(5000) / row(10000)) +
  plot_annotation(
    title = "Histograms of samples from N(0,1)",
    subtitle = "Binned to keep quantiles together if they are within 2*MCSE"
  )
  
  
```


```{r}
# largest number of bins (quantiles) to start from, and the sample size
n_max_bin = 100
n = 500

# fixed seed so the sample drawn below is reproducible
set.seed(1234)
x = rnorm(n)
# x = rbeta(n, 1, 8)

# mcse_quantile() of `x` at `probs` (unnamed), with any NA standard error
# replaced by 0.
mcse_quantile_ = function(x, probs) {
  se = mcse_quantile(x, probs, names = FALSE)
  replace(se, is.na(se), 0)
}

# interval half-width, in standard errors
se_mult = 2 # abs(qnorm(1/20/2))

# candidate quantiles of the sample and their Monte Carlo standard errors
probs = ppoints(n_max_bin, a = 0.9)
qs = quantile(x, probs = probs)
ses = mcse_quantile_(x, probs)

# Greedy left-to-right pass. The "current" quantile absorbs each following
# candidate whose interval overlaps its own; once a candidate's interval lies
# entirely to the right, the current quantile is recorded and the candidate
# becomes the new current one.
current_prob = probs[[1]]
current_q = qs[[1]]
current_se = ses[[1]]
current_n = 1 # number of candidates merged into the current quantile
selected_qs = double()
selected_ses = double()
for (i in seq_along(probs)[-1]) {
  # diff_se = sqrt(current_se^2 + ses[[i]]^2)
  # if ((qs[[i]] - current_q)/diff_se > se_mult) {
  if (qs[[i]] - se_mult*ses[[i]] <= current_q + se_mult*current_se) {
    # intervals overlap => merge: the merged probability is the running mean
    # of the merged candidates' probabilities; recompute quantile and se there
    current_prob = (current_prob * current_n + probs[[i]]) / (current_n + 1)
    current_q = quantile(x, current_prob)
    current_se = mcse_quantile_(x, current_prob)
    current_n = current_n + 1
    next
  }

  # intervals are disjoint => record the current quantile and start a new one
  new_i = length(selected_qs) + 1
  selected_qs[[new_i]] = current_q
  selected_ses[[new_i]] = current_se
  current_prob = probs[[i]]
  current_q = qs[[i]]
  current_se = ses[[i]]
  current_n = 1
}
# record whatever quantile is still open after the last candidate
new_i = length(selected_qs) + 1
selected_qs[[new_i]] = current_q
selected_ses[[new_i]] = current_se


# Then we'll figure out non-overlapping boundaries between bins: each selected
# quantile gets an interval of +/- se_mult standard errors, and neighbouring
# bins share an edge halfway between one interval's end and the next one's
# start.
lower = selected_qs - se_mult * selected_ses
upper = selected_qs + se_mult * selected_ses
adj_lower = rowMeans(cbind(lag(upper), lower), na.rm = TRUE)
adj_upper = rowMeans(cbind(upper, lead(lower)), na.rm = TRUE)

# hack to make sure data is covered
adj_lower[[1]] = min(x, adj_lower)
# the upper edge is exclusive below, so nudge it just past the largest value
adj_upper[[length(adj_upper)]] = max(x, adj_upper) + .Machine$double.eps * 2


# Now we'll count how many data points in the sample are actually in these
# bins to determine (1) densities and (2) actual bin boundaries:
data_in_bin = map2(adj_lower, adj_upper, ~ x[.x <= x & x < .y])
n_in_bin = lengths(data_in_bin)
# suppress warnings for min/max if bin is empty
# (an empty bin yields lower = Inf and upper = -Inf)
lower = suppressWarnings(vapply(data_in_bin, min, numeric(1)))
upper = suppressWarnings(vapply(data_in_bin, max, numeric(1)))
# make sure bin edges touch
adj_lower = rowMeans(cbind(lag(upper), lower), na.rm = TRUE)
adj_upper = rowMeans(cbind(upper, lead(lower)), na.rm = TRUE)
# make sure bin edges are finite; each vector is tested against itself (the
# upper edges were previously tested against adj_lower by mistake)
adj_lower = ifelse(adj_lower == -Inf, min(x), adj_lower)
adj_upper = ifelse(adj_upper == Inf, max(x), adj_upper)
# proportion of the sample in each bin divided by the bin width
density = n_in_bin / (adj_upper - adj_lower) / n

# Variable-width histogram of the sample, with a rug of the raw data and the
# standard normal density drawn in red for comparison.
hist_plot = ggplot(
    tibble(
      # four vertices per bin (bottom-left, top-left, top-right, bottom-right)
      # so the ribbon traces each bar's outline
      x = as.vector(rbind(adj_lower, adj_lower, adj_upper, adj_upper)),
      density = as.vector(rbind(0, density, density, 0))
      # x = as.vector(rbind(adj_lower, adj_upper)),
      # density = as.vector(rbind(density, density))
    ),
    aes(x = x)
  ) +
  geom_ribbon(aes(ymin = 0, ymax = density), color = "gray75") +
  geom_rug(data = data.frame(x)) +
  # geom_function(fun = ~ dbeta(.x, 1, 8), color = "red") +
  geom_function(fun = dnorm, color = "red") +
  labs(
    y = NULL,
    x = NULL,
    subtitle = paste0("Sample size = ", n, ", max bins = ", n_max_bin)
  ) +
  # coord_cartesian(xlim = c(0, 1)) +
  coord_cartesian(xlim = c(-3.5, 3.5)) +
  scale_y_continuous(breaks = NULL)

# Companion plot: one horizontal range (q +/- se_mult * se) per selected
# quantile, stacked by index, with a vertical guide at each selected quantile
# and a rug of all the candidate quantiles.
q_plot = ggplot(
    tibble(
      i = seq_along(selected_qs),
      q = selected_qs,
      se = selected_ses
    ),
    aes(x = q)
  ) +
  geom_segment(aes(y = 0, yend = max(i) + 1, xend = q), color = "gray75") +
  geom_linerange(aes(y = i, xmin = q - se_mult*se, xmax = q + se_mult*se), size = 0.5) +
  geom_rug(data = data.frame(q = qs)) +
  coord_cartesian(xlim = c(-3.5, 3.5))
  # coord_cartesian(xlim = c(0, 1))

# histogram on top, selected quantiles underneath
hist_plot / q_plot
```