start work on compare_pairs

amrcode1 · Nov 2, 2017 · 28b2504 · 28b2504
1 parent a950b18
commit 28b2504
Show file tree

Hide file tree

Showing 5 changed files with 158 additions and 23 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -18,6 +18,7 @@ export(sym)
 export(syms)
 export(tidy_correlation)
 export(tidy_quantile)
+import(dplyr)
 importFrom(magrittr,"%>%")
 importFrom(rlang,enexpr)
 importFrom(rlang,enquo)
@@ -29,3 +30,4 @@ importFrom(rlang,quos)
 importFrom(rlang,sym)
 importFrom(rlang,syms)
 importFrom(stats,quantile)
+importFrom(utils,modifyList)
diff --git a/R/tidy-stats.R b/R/tidy-stats.R
@@ -32,20 +32,20 @@ tidy_quantile.default <- function(data, var, probs = seq(.1, .9, .2)) {
 tidy_quantile.grouped_df <- function(data, var, probs = seq(.1, .9, .2)) {
   q <- enquo(var)
 
-  groups <- split(data, dplyr::group_indices(data)) %>%
-    lapply(dplyr::select, !!! dplyr::group_vars(data)) %>%
-    lapply(dplyr::distinct) %>%
-    lapply(dplyr::ungroup) %>%
-    dplyr::bind_rows(.id = "....id")
+  groups <- split(data, group_indices(data)) %>%
+    lapply(select, !!! group_vars(data)) %>%
+    lapply(distinct) %>%
+    lapply(ungroup) %>%
+    bind_rows(.id = "....id")
 
-  quantiles <- split(data, dplyr::group_indices(data)) %>%
-    lapply(dplyr::ungroup) %>%
+  quantiles <- split(data, group_indices(data)) %>%
+    lapply(ungroup) %>%
     lapply(tidy_quantile.default, !! q, probs) %>%
-    dplyr::bind_rows(.id = "....id")
+    bind_rows(.id = "....id")
 
   groups %>%
-    dplyr::left_join(quantiles, by = "....id") %>%
-    dplyr::select(-dplyr::one_of("....id"))
+    left_join(quantiles, by = "....id") %>%
+    select(-one_of("....id"))
 }
 
 
@@ -77,19 +77,19 @@ tidy_correlation <- function(data, ..., type = c("pearson", "spearman")) {
 #' @export
 tidy_correlation.grouped_df <- function(data, ..., type = c("pearson", "spearman")) {
   data %>%
-    dplyr::do(tidy_correlation.default(., ..., type = type)) %>%
-    dplyr::ungroup()
+    do(tidy_correlation.default(.data, ..., type = type)) %>%
+    ungroup()
 }
 
 #' @export
 tidy_correlation.default <- function(data, ..., type = c("pearson", "spearman")) {
-  dplyr::select(data, ...) %>%
+  select(data, ...) %>%
     as.matrix() %>%
     Hmisc::rcorr(type = type) %>%
     broom::tidy() %>%
     tibble::remove_rownames() %>%
     tibble::as_tibble() %>%
-    dplyr::mutate_at(c("column1", "column2"), as.character) %>%
-    dplyr::mutate_if(is.numeric, round, 4)
+    mutate_at(c("column1", "column2"), as.character) %>%
+    mutate_if(is.numeric, round, 4)
 }
 
diff --git a/R/tjmisc-package.R b/R/tjmisc-package.R
@@ -1,9 +1,28 @@
 #' @keywords internal
+#' @import dplyr
+#' @importFrom utils modifyList
 "_PACKAGE"
 
+# This is where I put as-yet unsupported helpers.
 
 
 
+
+
+
+
+wrap_with_defaults <- function(func, hard_defaults, soft_defaults) {
+  soft_defaults <- force(soft_defaults)
+  hard_defaults <- force(hard_defaults)
+  function(...) {
+    dots <- list(...)
+    # overwrite soft defaults with user options
+    # then overwrite with hard defaults
+    args <- modifyList(modifyList(soft_defaults, dots), hard_defaults)
+    do.call(func, args)
+  }
+}
+
 #' Create a sequence along the rows of a dataframe
 #' @param data a dataframe
 #' @return a sequence of integers along the rows of a dataframe
@@ -13,12 +32,12 @@ seq_along_rows <- function(data) {
 }
 
 
-fct_add_counts <- function(f) {
-  counts <- forcats::fct_count(f)
-  counts[["new"]] <- sprintf("%s (%s)", counts[["f"]], counts[["n"]])
-  x <- setNames(counts[["new"]], counts[["f"]])
-  forcats::fct_relabel(f, function(level) x[level])
-}
+# fct_add_counts <- function(f) {
+#   counts <- forcats::fct_count(f)
+#   counts[["new"]] <- sprintf("%s (%s)", counts[["f"]], counts[["n"]])
+#   x <- setNames(counts[["new"]], counts[["f"]])
+#   forcats::fct_relabel(f, function(level) x[level])
+# }
 
 
 #' Resequence a set of integer indices
@@ -37,5 +56,5 @@ fct_add_counts <- function(f) {
 resequence <- function(xs) {
   keys <- sort(unique(xs))
   values <- seq_along(keys)
-  unname(setNames(values, keys)[as.character(xs)])
+  unname(rlang::set_names(values, keys)[as.character(xs)])
 }
diff --git a/README.Rmd b/README.Rmd
@@ -91,11 +91,75 @@ iris %>%
   tidy_correlation(dplyr::starts_with("Petal"))
 ```
 
+<!-- ### Pairwise comparisons -->
+
+<!-- `compare_pairs()` compares all pairs of values among levels of a categorical -->
+<!-- variable. Hmmm, that sounds confusing. Here's an example. We compute the -->
+<!-- difference in average score between each pair of workers. -->
+
+<!-- ```{r} -->
+<!-- to_compare <- nlme::Machines %>% -->
+<!--   group_by(Worker) %>% -->
+<!--   summarise(avg_score = mean(score)) %>% -->
+<!--   print() -->
+
+<!-- to_compare %>% -->
+<!--   compare_pairs(Worker, avg_score) %>% -->
+<!--   rename(difference = value) %>% -->
+<!--   mutate_if(is.numeric, round, 1) -->
+<!-- ``` -->
+
+<!-- I use it to compute posterior differences in Bayesian models. For example, let's -->
+<!-- fit a Bayesian model of average sepal length for each species in `iris`. -->
+
+<!-- ```{r, results = "hide"} -->
+<!-- library(rstanarm) -->
+<!-- m <- stan_glm( -->
+<!--   Sepal.Length ~ Species - 1, -->
+<!--   iris, -->
+<!--   family = gaussian, -->
+<!--   prior = normal(0, 1), -->
+<!--   prior_intercept = normal(0, 1)) -->
+<!-- ``` -->
+
+<!-- Now, we have a posterior distributions of species means. -->
+
+<!-- ```{r} -->
+<!-- newdata <- data.frame(Species = unique(iris$Species)) -->
+
+<!-- p_means <- posterior_linpred(m, newdata = newdata) %>% -->
+<!--   as.data.frame() %>% -->
+<!--   tibble::as_tibble() %>% -->
+<!--   setNames(newdata$Species) %>% -->
+<!--   tibble::rowid_to_column("draw") %>% -->
+<!--   tidyr::gather(species, mean, -draw) %>% -->
+<!--   print() -->
+<!-- ``` -->
+
+<!-- For each posterior sample, we can compute pairwise differences of means. -->
+
+<!-- ```{r pairs, fig.width = 4, fig.height = 2.5} -->
+<!-- pair_diffs <- compare_pairs(data, species, mean) %>% -->
+<!--   print() -->
+
+<!-- library(ggplot2) -->
+
+<!-- ggplot(pair_diffs) + -->
+<!--   aes(x = pair, y = value) + -->
+<!--   stat_summary(fun.data = median_hilow, geom = "linerange") + -->
+<!--   stat_summary(fun.data = median_hilow, fun.args = list(conf.int = .8), -->
+<!--                size = 2, geom = "linerange") + -->
+<!--   stat_summary(fun.y = median, size = 5, shape = 3, geom = "point") + -->
+<!--   labs(x = NULL, y = "Difference in posterior means") + -->
+<!--   coord_flip() -->
+<!-- ``` -->
+
 
 ### Et cetera
 
 `ggpreview()` is like ggplot2's `ggsave()` but it saves an image to a temporary
 file and then opens it in the system viewer. If you've ever found yourself in
 a loop of saving a plot, leaving RStudio to doubleclick the file, sighing, going
 back to RStudio, tweaking the height or width or plot theme, ever so slowly
-spiraling in on your desired plot, then `ggpreview()` is for you.
+spiraling in on your desired plot, then `ggpreview()` is for you.
+
diff --git a/README.md b/README.md
@@ -146,6 +146,56 @@ iris %>%
 #> 3  virginica Petal.Length Petal.Width   0.3221    50  0.0225
 ```
 
+<!-- ### Pairwise comparisons -->
+<!-- `compare_pairs()` compares all pairs of values among levels of a categorical -->
+<!-- variable. Hmmm, that sounds confusing. Here's an example. We compute the -->
+<!-- difference in average score between each pair of workers. -->
+<!-- ```{r} -->
+<!-- to_compare <- nlme::Machines %>% -->
+<!--   group_by(Worker) %>% -->
+<!--   summarise(avg_score = mean(score)) %>% -->
+<!--   print() -->
+<!-- to_compare %>% -->
+<!--   compare_pairs(Worker, avg_score) %>% -->
+<!--   rename(difference = value) %>% -->
+<!--   mutate_if(is.numeric, round, 1) -->
+<!-- ``` -->
+<!-- I use it to compute posterior differences in Bayesian models. For example, let's -->
+<!-- fit a Bayesian model of average sepal length for each species in `iris`. -->
+<!-- ```{r, results = "hide"} -->
+<!-- library(rstanarm) -->
+<!-- m <- stan_glm( -->
+<!--   Sepal.Length ~ Species - 1, -->
+<!--   iris, -->
+<!--   family = gaussian, -->
+<!--   prior = normal(0, 1), -->
+<!--   prior_intercept = normal(0, 1)) -->
+<!-- ``` -->
+<!-- Now, we have a posterior distributions of species means. -->
+<!-- ```{r} -->
+<!-- newdata <- data.frame(Species = unique(iris$Species)) -->
+<!-- p_means <- posterior_linpred(m, newdata = newdata) %>% -->
+<!--   as.data.frame() %>% -->
+<!--   tibble::as_tibble() %>% -->
+<!--   setNames(newdata$Species) %>% -->
+<!--   tibble::rowid_to_column("draw") %>% -->
+<!--   tidyr::gather(species, mean, -draw) %>% -->
+<!--   print() -->
+<!-- ``` -->
+<!-- For each posterior sample, we can compute pairwise differences of means. -->
+<!-- ```{r pairs, fig.width = 4, fig.height = 2.5} -->
+<!-- pair_diffs <- compare_pairs(data, species, mean) %>% -->
+<!--   print() -->
+<!-- library(ggplot2) -->
+<!-- ggplot(pair_diffs) + -->
+<!--   aes(x = pair, y = value) + -->
+<!--   stat_summary(fun.data = median_hilow, geom = "linerange") + -->
+<!--   stat_summary(fun.data = median_hilow, fun.args = list(conf.int = .8), -->
+<!--                size = 2, geom = "linerange") + -->
+<!--   stat_summary(fun.y = median, size = 5, shape = 3, geom = "point") + -->
+<!--   labs(x = NULL, y = "Difference in posterior means") + -->
+<!--   coord_flip() -->
+<!-- ``` -->
 ### Et cetera
 
 `ggpreview()` is like ggplot2's `ggsave()` but it saves an image to a temporary file and then opens it in the system viewer. If you've ever found yourself in a loop of saving a plot, leaving RStudio to doubleclick the file, sighing, going back to RStudio, tweaking the height or width or plot theme, ever so slowly spiraling in on your desired plot, then `ggpreview()` is for you.