Skip to content

Commit

Permalink
start work on compare_pairs
Browse files Browse the repository at this point in the history
  • Loading branch information
tjmahr committed Nov 2, 2017
1 parent a950b18 commit 28b2504
Show file tree
Hide file tree
Showing 5 changed files with 158 additions and 23 deletions.
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ export(sym)
export(syms)
export(tidy_correlation)
export(tidy_quantile)
import(dplyr)
importFrom(magrittr,"%>%")
importFrom(rlang,enexpr)
importFrom(rlang,enquo)
Expand All @@ -29,3 +30,4 @@ importFrom(rlang,quos)
importFrom(rlang,sym)
importFrom(rlang,syms)
importFrom(stats,quantile)
importFrom(utils,modifyList)
30 changes: 15 additions & 15 deletions R/tidy-stats.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,20 +32,20 @@ tidy_quantile.default <- function(data, var, probs = seq(.1, .9, .2)) {
tidy_quantile.grouped_df <- function(data, var, probs = seq(.1, .9, .2)) {
q <- enquo(var)

groups <- split(data, dplyr::group_indices(data)) %>%
lapply(dplyr::select, !!! dplyr::group_vars(data)) %>%
lapply(dplyr::distinct) %>%
lapply(dplyr::ungroup) %>%
dplyr::bind_rows(.id = "....id")
groups <- split(data, group_indices(data)) %>%
lapply(select, !!! group_vars(data)) %>%
lapply(distinct) %>%
lapply(ungroup) %>%
bind_rows(.id = "....id")

quantiles <- split(data, dplyr::group_indices(data)) %>%
lapply(dplyr::ungroup) %>%
quantiles <- split(data, group_indices(data)) %>%
lapply(ungroup) %>%
lapply(tidy_quantile.default, !! q, probs) %>%
dplyr::bind_rows(.id = "....id")
bind_rows(.id = "....id")

groups %>%
dplyr::left_join(quantiles, by = "....id") %>%
dplyr::select(-dplyr::one_of("....id"))
left_join(quantiles, by = "....id") %>%
select(-one_of("....id"))
}


Expand Down Expand Up @@ -77,19 +77,19 @@ tidy_correlation <- function(data, ..., type = c("pearson", "spearman")) {
#' @export
tidy_correlation.grouped_df <- function(data, ..., type = c("pearson", "spearman")) {
data %>%
dplyr::do(tidy_correlation.default(., ..., type = type)) %>%
dplyr::ungroup()
do(tidy_correlation.default(.data, ..., type = type)) %>%
ungroup()
}

#' @export
tidy_correlation.default <- function(data, ..., type = c("pearson", "spearman")) {
dplyr::select(data, ...) %>%
select(data, ...) %>%
as.matrix() %>%
Hmisc::rcorr(type = type) %>%
broom::tidy() %>%
tibble::remove_rownames() %>%
tibble::as_tibble() %>%
dplyr::mutate_at(c("column1", "column2"), as.character) %>%
dplyr::mutate_if(is.numeric, round, 4)
mutate_at(c("column1", "column2"), as.character) %>%
mutate_if(is.numeric, round, 4)
}

33 changes: 26 additions & 7 deletions R/tjmisc-package.R
Original file line number Diff line number Diff line change
@@ -1,9 +1,28 @@
#' @keywords internal
#' @import dplyr
#' @importFrom utils modifyList
"_PACKAGE"

# This is where I put as-yet unsupported helpers.







wrap_with_defaults <- function(func, hard_defaults, soft_defaults) {
soft_defaults <- force(soft_defaults)
hard_defaults <- force(hard_defaults)
function(...) {
dots <- list(...)
# overwrite soft defaults with user options
# then overwrite with hard defaults
args <- modifyList(modifyList(soft_defaults, dots), hard_defaults)
do.call(func, args)
}
}

#' Create a sequence along the rows of a dataframe
#' @param data a dataframe
#' @return a sequence of integers along the rows of a dataframe
Expand All @@ -13,12 +32,12 @@ seq_along_rows <- function(data) {
}


fct_add_counts <- function(f) {
counts <- forcats::fct_count(f)
counts[["new"]] <- sprintf("%s (%s)", counts[["f"]], counts[["n"]])
x <- setNames(counts[["new"]], counts[["f"]])
forcats::fct_relabel(f, function(level) x[level])
}
# fct_add_counts <- function(f) {
# counts <- forcats::fct_count(f)
# counts[["new"]] <- sprintf("%s (%s)", counts[["f"]], counts[["n"]])
# x <- setNames(counts[["new"]], counts[["f"]])
# forcats::fct_relabel(f, function(level) x[level])
# }


#' Resequence a set of integer indices
Expand All @@ -37,5 +56,5 @@ fct_add_counts <- function(f) {
resequence <- function(xs) {
keys <- sort(unique(xs))
values <- seq_along(keys)
unname(setNames(values, keys)[as.character(xs)])
unname(rlang::set_names(values, keys)[as.character(xs)])
}
66 changes: 65 additions & 1 deletion README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -91,11 +91,75 @@ iris %>%
tidy_correlation(dplyr::starts_with("Petal"))
```

<!-- ### Pairwise comparisons -->

<!-- `compare_pairs()` compares all pairs of values among levels of a categorical -->
<!-- variable. Hmmm, that sounds confusing. Here's an example. We compute the -->
<!-- difference in average score between each pair of workers. -->

<!-- ```{r} -->
<!-- to_compare <- nlme::Machines %>% -->
<!-- group_by(Worker) %>% -->
<!-- summarise(avg_score = mean(score)) %>% -->
<!-- print() -->

<!-- to_compare %>% -->
<!-- compare_pairs(Worker, avg_score) %>% -->
<!-- rename(difference = value) %>% -->
<!-- mutate_if(is.numeric, round, 1) -->
<!-- ``` -->

<!-- I use it to compute posterior differences in Bayesian models. For example, let's -->
<!-- fit a Bayesian model of average sepal length for each species in `iris`. -->

<!-- ```{r, results = "hide"} -->
<!-- library(rstanarm) -->
<!-- m <- stan_glm( -->
<!-- Sepal.Length ~ Species - 1, -->
<!-- iris, -->
<!-- family = gaussian, -->
<!-- prior = normal(0, 1), -->
<!-- prior_intercept = normal(0, 1)) -->
<!-- ``` -->

<!-- Now, we have a posterior distributions of species means. -->

<!-- ```{r} -->
<!-- newdata <- data.frame(Species = unique(iris$Species)) -->

<!-- p_means <- posterior_linpred(m, newdata = newdata) %>% -->
<!-- as.data.frame() %>% -->
<!-- tibble::as_tibble() %>% -->
<!-- setNames(newdata$Species) %>% -->
<!-- tibble::rowid_to_column("draw") %>% -->
<!-- tidyr::gather(species, mean, -draw) %>% -->
<!-- print() -->
<!-- ``` -->

<!-- For each posterior sample, we can compute pairwise differences of means. -->

<!-- ```{r pairs, fig.width = 4, fig.height = 2.5} -->
<!-- pair_diffs <- compare_pairs(data, species, mean) %>% -->
<!-- print() -->

<!-- library(ggplot2) -->

<!-- ggplot(pair_diffs) + -->
<!-- aes(x = pair, y = value) + -->
<!-- stat_summary(fun.data = median_hilow, geom = "linerange") + -->
<!-- stat_summary(fun.data = median_hilow, fun.args = list(conf.int = .8), -->
<!-- size = 2, geom = "linerange") + -->
<!-- stat_summary(fun.y = median, size = 5, shape = 3, geom = "point") + -->
<!-- labs(x = NULL, y = "Difference in posterior means") + -->
<!-- coord_flip() -->
<!-- ``` -->


### Et cetera

`ggpreview()` is like ggplot2's `ggsave()` but it saves an image to a temporary
file and then opens it in the system viewer. If you've ever found yourself in
a loop of saving a plot, leaving RStudio to doubleclick the file, sighing, going
back to RStudio, tweaking the height or width or plot theme, ever so slowly
spiraling in on your desired plot, then `ggpreview()` is for you.
spiraling in on your desired plot, then `ggpreview()` is for you.

50 changes: 50 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,56 @@ iris %>%
#> 3 virginica Petal.Length Petal.Width 0.3221 50 0.0225
```

<!-- ### Pairwise comparisons -->
<!-- `compare_pairs()` compares all pairs of values among levels of a categorical -->
<!-- variable. Hmmm, that sounds confusing. Here's an example. We compute the -->
<!-- difference in average score between each pair of workers. -->
<!-- ```{r} -->
<!-- to_compare <- nlme::Machines %>% -->
<!-- group_by(Worker) %>% -->
<!-- summarise(avg_score = mean(score)) %>% -->
<!-- print() -->
<!-- to_compare %>% -->
<!-- compare_pairs(Worker, avg_score) %>% -->
<!-- rename(difference = value) %>% -->
<!-- mutate_if(is.numeric, round, 1) -->
<!-- ``` -->
<!-- I use it to compute posterior differences in Bayesian models. For example, let's -->
<!-- fit a Bayesian model of average sepal length for each species in `iris`. -->
<!-- ```{r, results = "hide"} -->
<!-- library(rstanarm) -->
<!-- m <- stan_glm( -->
<!-- Sepal.Length ~ Species - 1, -->
<!-- iris, -->
<!-- family = gaussian, -->
<!-- prior = normal(0, 1), -->
<!-- prior_intercept = normal(0, 1)) -->
<!-- ``` -->
<!-- Now, we have a posterior distributions of species means. -->
<!-- ```{r} -->
<!-- newdata <- data.frame(Species = unique(iris$Species)) -->
<!-- p_means <- posterior_linpred(m, newdata = newdata) %>% -->
<!-- as.data.frame() %>% -->
<!-- tibble::as_tibble() %>% -->
<!-- setNames(newdata$Species) %>% -->
<!-- tibble::rowid_to_column("draw") %>% -->
<!-- tidyr::gather(species, mean, -draw) %>% -->
<!-- print() -->
<!-- ``` -->
<!-- For each posterior sample, we can compute pairwise differences of means. -->
<!-- ```{r pairs, fig.width = 4, fig.height = 2.5} -->
<!-- pair_diffs <- compare_pairs(data, species, mean) %>% -->
<!-- print() -->
<!-- library(ggplot2) -->
<!-- ggplot(pair_diffs) + -->
<!-- aes(x = pair, y = value) + -->
<!-- stat_summary(fun.data = median_hilow, geom = "linerange") + -->
<!-- stat_summary(fun.data = median_hilow, fun.args = list(conf.int = .8), -->
<!-- size = 2, geom = "linerange") + -->
<!-- stat_summary(fun.y = median, size = 5, shape = 3, geom = "point") + -->
<!-- labs(x = NULL, y = "Difference in posterior means") + -->
<!-- coord_flip() -->
<!-- ``` -->
### Et cetera

`ggpreview()` is like ggplot2's `ggsave()` but it saves an image to a temporary file and then opens it in the system viewer. If you've ever found yourself in a loop of saving a plot, leaving RStudio to doubleclick the file, sighing, going back to RStudio, tweaking the height or width or plot theme, ever so slowly spiraling in on your desired plot, then `ggpreview()` is for you.

0 comments on commit 28b2504

Please sign in to comment.