-
-
Notifications
You must be signed in to change notification settings - Fork 57
/
Copy pathcor_test.R
318 lines (292 loc) · 13.6 KB
/
cor_test.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
#' Correlation test
#'
#' This function performs a correlation test between two variables.
#' You can easily visualize the result using [`plot()`][visualisation_recipe.easycormatrix()] (see examples [**here**](https://easystats.github.io/correlation/reference/visualisation_recipe.easycormatrix.html#ref-examples)).
#'
#' @param data A data frame.
#' @param x,y Names of two variables present in the data.
#' @param ci Confidence/Credible Interval level. If `"default"`, then it is
#' set to `0.95` (`95%` CI).
#' @param method A character string indicating which correlation coefficient is
#' to be used for the test. One of `"pearson"` (default), `"kendall"`,
#' `"spearman"` (but see also the `robust` argument), `"biserial"`,
#' `"polychoric"`, `"tetrachoric"`, `"biweight"`, `"distance"`, `"percentage"`
#' (for percentage bend correlation), `"blomqvist"` (for Blomqvist's
#' coefficient), `"hoeffding"` (for Hoeffding's D), `"gamma"`, `"gaussian"`
#' (for Gaussian Rank correlation) or `"shepherd"` (for Shepherd's Pi
#' correlation). Setting `"auto"` will attempt at selecting the most relevant
#' method (polychoric when ordinal factors involved, tetrachoric when
#' dichotomous factors involved, point-biserial if one dichotomous and one
#' continuous and pearson otherwise). See below the **details** section for a
#' description of these indices.
#' @param bayesian If `TRUE`, will run the correlations under a Bayesian
#' framework.
#' @param partial_bayesian If partial correlations under a Bayesian framework
#' are needed, you will also need to set `partial_bayesian` to `TRUE` to
#' obtain "full" Bayesian partial correlations. Otherwise, you will obtain
#' pseudo-Bayesian partial correlations (i.e., Bayesian correlation based on
#' frequentist partialization).
#' @param include_factors If `TRUE`, the factors are kept and eventually
#' converted to numeric or used as random effects (depending of `multilevel`).
#' If `FALSE`, factors are removed upfront.
#' @param partial Can be `TRUE` or `"semi"` for partial and semi-partial
#' correlations, respectively.
#' @inheritParams datawizard::adjust
#' @param bayesian_prior For the prior argument, several named values are
#' recognized: `"medium.narrow"`, `"medium"`, `"wide"`, and `"ultrawide"`.
#' These correspond to scale values of `1/sqrt(27)`, `1/3`, `1/sqrt(3)` and
#' `1`, respectively. See the `BayesFactor::correlationBF` function.
#' @param bayesian_ci_method,bayesian_test See arguments in
#' [`parameters::model_parameters()`] for `BayesFactor` tests.
#' @param ranktransform If `TRUE`, will rank-transform the variables prior to
#' estimating the correlation, which is one way of making the analysis more
#' resistant to extreme values (outliers). Note that, for instance, a
#' Pearson's correlation on rank-transformed data is equivalent to a
#' Spearman's rank correlation. Thus, using `robust=TRUE` and
#' `method="spearman"` is redundant. Nonetheless, it is an easy option to
#' increase the robustness of the correlation as well as flexible way to
#' obtain Bayesian or multilevel Spearman-like rank correlations.
#' @param winsorize Another way of making the correlation more "robust" (i.e.,
#' limiting the impact of extreme values). Can be either `FALSE` or a number
#' between 0 and 1 (e.g., `0.2`) that corresponds to the desired threshold.
#' See the [`datawizard::winsorize()`] function for more details.
#' @param verbose Toggle warnings.
#' @param ... Additional arguments (e.g., `alternative`) to be passed to
#' other methods. See `stats::cor.test` for further details.
#'
#'
#' @inherit correlation details
#'
#' @examples
#' library(correlation)
#'
#' cor_test(iris, "Sepal.Length", "Sepal.Width")
#' cor_test(iris, "Sepal.Length", "Sepal.Width", method = "spearman")
#' \donttest{
#' cor_test(iris, "Sepal.Length", "Sepal.Width", method = "kendall")
#' cor_test(iris, "Sepal.Length", "Sepal.Width", method = "biweight")
#' cor_test(iris, "Sepal.Length", "Sepal.Width", method = "distance")
#' cor_test(iris, "Sepal.Length", "Sepal.Width", method = "percentage")
#'
#' if (require("wdm", quietly = TRUE)) {
#' cor_test(iris, "Sepal.Length", "Sepal.Width", method = "blomqvist")
#' }
#'
#' if (require("Hmisc", quietly = TRUE)) {
#' cor_test(iris, "Sepal.Length", "Sepal.Width", method = "hoeffding")
#' }
#' cor_test(iris, "Sepal.Length", "Sepal.Width", method = "gamma")
#' cor_test(iris, "Sepal.Length", "Sepal.Width", method = "gaussian")
#' cor_test(iris, "Sepal.Length", "Sepal.Width", method = "shepherd")
#' if (require("BayesFactor", quietly = TRUE)) {
#' cor_test(iris, "Sepal.Length", "Sepal.Width", bayesian = TRUE)
#' }
#'
#' # Robust (these two are equivalent)
#' cor_test(iris, "Sepal.Length", "Sepal.Width", method = "spearman")
#' cor_test(iris, "Sepal.Length", "Sepal.Width", method = "pearson", ranktransform = TRUE)
#'
#' # Winsorized
#' cor_test(iris, "Sepal.Length", "Sepal.Width", winsorize = 0.2)
#'
#' # Tetrachoric
#' if (require("psych", quietly = TRUE) && require("rstanarm", quietly = TRUE)) {
#' data <- iris
#' data$Sepal.Width_binary <- ifelse(data$Sepal.Width > 3, 1, 0)
#' data$Petal.Width_binary <- ifelse(data$Petal.Width > 1.2, 1, 0)
#' cor_test(data, "Sepal.Width_binary", "Petal.Width_binary", method = "tetrachoric")
#'
#' # Biserial
#' cor_test(data, "Sepal.Width", "Petal.Width_binary", method = "biserial")
#'
#' # Polychoric
#' data$Petal.Width_ordinal <- as.factor(round(data$Petal.Width))
#' data$Sepal.Length_ordinal <- as.factor(round(data$Sepal.Length))
#' cor_test(data, "Petal.Width_ordinal", "Sepal.Length_ordinal", method = "polychoric")
#'
#' # When one variable is continuous, will run 'polyserial' correlation
#' cor_test(data, "Sepal.Width", "Sepal.Length_ordinal", method = "polychoric")
#' }
#'
#' # Partial
#' cor_test(iris, "Sepal.Length", "Sepal.Width", partial = TRUE)
#' if (require("lme4", quietly = TRUE)) {
#' cor_test(iris, "Sepal.Length", "Sepal.Width", multilevel = TRUE)
#' }
#' if (require("rstanarm", quietly = TRUE)) {
#' cor_test(iris, "Sepal.Length", "Sepal.Width", partial_bayesian = TRUE)
#' }
#' }
#' @export
cor_test <- function(data,
x,
y,
method = "pearson",
ci = 0.95,
bayesian = FALSE,
bayesian_prior = "medium",
bayesian_ci_method = "hdi",
bayesian_test = c("pd", "rope", "bf"),
include_factors = FALSE,
partial = FALSE,
partial_bayesian = FALSE,
multilevel = FALSE,
ranktransform = FALSE,
winsorize = FALSE,
verbose = TRUE,
...) {
# valid matrix checks
if (!all(x %in% names(data)) || !all(y %in% names(data))) {
insight::format_error("The names you entered for x and y are not available in the dataset. Make sure there are no typos!")
}
if (ci == "default") ci <- 0.95
if (!partial && (partial_bayesian || multilevel)) partial <- TRUE
# Make sure factor is no factor
if (!method %in% c("tetra", "tetrachoric", "poly", "polychoric")) {
data[c(x, y)] <- datawizard::to_numeric(data[c(x, y)], dummy_factors = FALSE)
}
# Partial
if (!isFALSE(partial)) {
# partial
if (isTRUE(partial)) {
data[[x]] <- datawizard::adjust(data[names(data) != y], multilevel = multilevel, bayesian = partial_bayesian)[[x]]
data[[y]] <- datawizard::adjust(data[names(data) != x], multilevel = multilevel, bayesian = partial_bayesian)[[y]]
}
# semi-partial
if (partial == "semi") {
insight::format_error("Semi-partial correlations are not supported yet. Get in touch if you want to contribute.")
}
}
# Winsorize
if (!isFALSE(winsorize) && !is.null(winsorize)) {
# set default (if not specified)
if (isTRUE(winsorize)) {
winsorize <- 0.2
}
# winsorization would otherwise fail in case of NAs present
data <- as.data.frame(
datawizard::winsorize(stats::na.omit(data[c(x, y)]),
threshold = winsorize,
verbose = verbose
)
)
}
# Rank transform (i.e., "robust")
if (ranktransform) {
data[c(x, y)] <- datawizard::ranktransform(data[c(x, y)], sign = FALSE, method = "average")
}
# check if enough no. of obs ------------------------------
# this is a trick in case the number of valid observations is lower than 3
n_obs <- length(.complete_variable_x(data, x, y))
invalid <- FALSE
if (n_obs < 3L) {
if (isTRUE(verbose)) {
insight::format_warning(paste(x, "and", y, "have less than 3 complete observations. Returning NA."))
}
invalid <- TRUE
original_info <- list(data = data, x = x, y = y)
data <- datasets::mtcars # Basically use a working dataset so the correlation doesn't fail
x <- "mpg"
y <- "disp"
}
# Find method
method <- tolower(method)
if (method == "auto" && !bayesian) method <- .find_correlationtype(data, x, y)
if (method == "auto" && bayesian) method <- "pearson"
# Frequentist
if (!bayesian) {
if (method %in% c("tetra", "tetrachoric")) {
out <- .cor_test_tetrachoric(data, x, y, ci = ci, ...)
} else if (method %in% c("poly", "polychoric")) {
out <- .cor_test_polychoric(data, x, y, ci = ci, ...)
} else if (method %in% c("biserial", "pointbiserial", "point-biserial")) {
out <- .cor_test_biserial(data, x, y, ci = ci, method = method, ...)
} else if (method == "biweight") {
out <- .cor_test_biweight(data, x, y, ci = ci, ...)
} else if (method == "distance") {
out <- .cor_test_distance(data, x, y, ci = ci, ...)
} else if (method %in% c("percentage", "percentage_bend", "percentagebend", "pb")) {
out <- .cor_test_percentage(data, x, y, ci = ci, ...)
} else if (method %in% c("blomqvist", "median", "medial")) {
out <- .cor_test_blomqvist(data, x, y, ci = ci, ...)
} else if (method == "hoeffding") {
out <- .cor_test_hoeffding(data, x, y, ci = ci, ...)
} else if (method == "somers") {
out <- .cor_test_somers(data, x, y, ci = ci, ...)
} else if (method == "gamma") {
out <- .cor_test_gamma(data, x, y, ci = ci, ...)
} else if (method == "gaussian") {
out <- .cor_test_gaussian(data, x, y, ci = ci, ...)
} else if (method %in% c("shepherd", "sheperd", "shepherdspi", "pi")) {
out <- .cor_test_shepherd(data, x, y, ci = ci, bayesian = FALSE, ...)
} else {
out <- .cor_test_freq(data, x, y, ci = ci, method = method, ...)
}
# Bayesian
} else if (method %in% c("tetra", "tetrachoric")) {
insight::format_error("Tetrachoric Bayesian correlations are not supported yet. Get in touch if you want to contribute.")
} else if (method %in% c("poly", "polychoric")) {
insight::format_error("Polychoric Bayesian correlations are not supported yet. Get in touch if you want to contribute.")
} else if (method %in% c("biserial", "pointbiserial", "point-biserial")) {
insight::format_error("Biserial Bayesian correlations are not supported yet. Get in touch if you want to contribute.")
} else if (method == "biweight") {
insight::format_error("Biweight Bayesian correlations are not supported yet. Get in touch if you want to contribute.")
} else if (method == "distance") {
insight::format_error("Bayesian distance correlations are not supported yet. Get in touch if you want to contribute.")
} else if (method %in% c("percentage", "percentage_bend", "percentagebend", "pb")) {
insight::format_error("Bayesian Percentage Bend correlations are not supported yet. Get in touch if you want to contribute.")
} else if (method %in% c("blomqvist", "median", "medial")) {
insight::format_error("Bayesian Blomqvist correlations are not supported yet. Check-out the BBcor package (https://github.com/donaldRwilliams/BBcor).")
} else if (method == "hoeffding") {
insight::format_error("Bayesian Hoeffding's correlations are not supported yet. Check-out the BBcor package (https://github.com/donaldRwilliams/BBcor).")
} else if (method == "gamma") {
insight::format_error("Bayesian gamma correlations are not supported yet. Get in touch if you want to contribute.")
} else if (method %in% c("shepherd", "sheperd", "shepherdspi", "pi")) {
out <- .cor_test_shepherd(data, x, y, ci = ci, bayesian = TRUE, ...)
} else {
out <- .cor_test_bayes(
data,
x,
y,
ci = ci,
method = method,
bayesian_prior = bayesian_prior,
bayesian_ci_method = bayesian_ci_method,
bayesian_test = bayesian_test,
...
)
}
# Replace by NANs if invalid
if (isTRUE(invalid)) {
data <- original_info$data
out$Parameter1 <- original_info$x
out$Parameter2 <- original_info$y
out[!names(out) %in% c("Parameter1", "Parameter2")] <- NA
}
# Number of observations and CI
out$n_Obs <- n_obs
out$CI <- ci
# Reorder columns
if ("CI_low" %in% names(out)) {
col_order <- c("Parameter1", "Parameter2", "r", "rho", "tau", "Dxy", "CI", "CI_low", "CI_high")
out <- out[c(
col_order[col_order %in% names(out)],
setdiff(colnames(out), col_order[col_order %in% names(out)])
)]
}
# Output
attr(out, "coefficient_name") <- c("rho", "r", "tau", "Dxy")[c("rho", "r", "tau", "Dxy") %in% names(out)][1]
attr(out, "ci") <- ci
attr(out, "data") <- data
class(out) <- unique(c("easycor_test", "easycorrelation", "parameters_model", class(out)))
out
}
# Utilities ---------------------------------------------------------------
#' @keywords internal
.complete_variable_x <- function(data, x, y) {
data[[x]][stats::complete.cases(data[[x]], data[[y]])]
}
#' @keywords internal
.complete_variable_y <- function(data, x, y) {
data[[y]][stats::complete.cases(data[[x]], data[[y]])]
}