Description
openedon Jul 29, 2020
Setting user_na = TRUE in the haven::read_spss() command creates a file that produces an error when used subsequently in a dplyr::mutate() command.
The following reprex has two examples: the first example, mygss1, shows that haven::read_spss() with user_na=FALSE (the default) works as expected. It creates a file that can be manipulated with dplyr::mutate() without error.
The second example, mygss2, is created by using haven::read_spss(..., user_na = TRUE) and leads to an error when that file is subsequently used by dplyr::mutate.
This issue was originally submitted as an issue for dplyr (tidyverse/dplyr#5424). That issue was closed because it appeared to be an issue with a different package and not with dplyr. Thus, this issue is being raised for the haven package. I notice that:
The user_na = FALSE option leads to the list of classes for the variables in the dataset to be:
$ class : chr [1:3] "haven_labelled" "vctrs_vctr" "double"
By contrast, the user_na = TRUE option leads to the list of classes for the variables in the dataset to be:
$ class : chr [1:4] "haven_labelled_spss" "haven_labelled" "vctrs_vctr" "double"
Since UseMethod() only considers the first two classes of the object for dispatch, could the additional class in the front of the list of classes be causing the error?
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(haven)
# Fetch the 2018 GSS archive (SPSS format) from the GSS website into a
# temporary file, extract only GSS2018.sav from it, then remove the archive.
temp1 <- tempfile()
download.file(
  url = "https://gss.norc.org/Documents/spss/2018_spss.zip",
  destfile = temp1
)
unzip(zipfile = temp1, files = "GSS2018.sav")
unlink(temp1)
# Read in SPSS using haven without user_na option (this works as expected).
# Only the RACE and TAX columns are selected; user_na keeps its default.
mygss1 <- haven::read_spss("GSS2018.sav", col_select = c("RACE","TAX"))
# Per-column summary; the recorded output follows.
summary(mygss1)
#> RACE TAX
#> Min. :1.000 Min. :1.000
#> 1st Qu.:1.000 1st Qu.:1.000
#> Median :1.000 Median :1.000
#> Mean :1.394 Mean :1.483
#> 3rd Qu.:2.000 3rd Qu.:2.000
#> Max. :3.000 Max. :3.000
#> NA's :817
#
# View the attributes for debugging
#
# For every column of mygss1: print its name, the structure of its
# attributes, its value labels, and a frequency table of its values.
for (col_name in names(mygss1)) {
  print(col_name)
  str(attributes(mygss1[[col_name]]))
  print_labels(mygss1[[col_name]])
  # Index inline (not via a local variable) so table() adds no header name.
  print(table(mygss1[[col_name]]))
}
#> [1] "RACE"
#> List of 4
#> $ label : chr "Race of respondent"
#> $ format.spss: chr "F1.0"
#> $ class : chr [1:3] "haven_labelled" "vctrs_vctr" "double"
#> $ labels : Named num [1:4] 0 1 2 3
#> ..- attr(*, "names")= chr [1:4] "IAP" "WHITE" "BLACK" "OTHER"
#>
#> Labels:
#> value label
#> 0 IAP
#> 1 WHITE
#> 2 BLACK
#> 3 OTHER
#>
#> 1 2 3
#> 1693 385 270
#> [1] "TAX"
#> List of 4
#> $ label : chr "R's federal income tax"
#> $ format.spss: chr "F1.0"
#> $ class : chr [1:3] "haven_labelled" "vctrs_vctr" "double"
#> $ labels : Named num [1:7] 0 1 2 3 4 8 9
#> ..- attr(*, "names")= chr [1:7] "IAP" "TOO HIGH" "ABOUT RIGHT" "TOO LOW" ...
#>
#> Labels:
#> value label
#> 0 IAP
#> 1 TOO HIGH
#> 2 ABOUT RIGHT
#> 3 TOO LOW
#> 4 R PAYS NONE<VOL.>
#> 8 DK
#> 9 NA
#>
#> 1 2 3
#> 829 664 38
# Derive hightax from TAX: 1 when TAX is 1, 0 when TAX is 2 or 3.
# No error is recorded for this call in the reprex output.
mygss1 <- mutate(
  mygss1,
  hightax = case_when(
    TAX == 1 ~ 1,
    TAX == 2 | TAX == 3 ~ 0
  )
)
########################################################
#
# Error occurs if read_spss() is used WITH user_na = TRUE
#
########################################################
# Same file and column selection as for mygss1, but with user_na = TRUE.
mygss2 <- haven::read_spss("GSS2018.sav", col_select = c("RACE","TAX"), user_na = TRUE)
# Per-column summary; the recorded output follows.
summary(mygss2)
#> RACE TAX
#> Min. :1.000 Min. :0.000
#> 1st Qu.:1.000 1st Qu.:0.000
#> Median :1.000 Median :1.000
#> Mean :1.394 Mean :1.115
#> 3rd Qu.:2.000 3rd Qu.:2.000
#> Max. :3.000 Max. :9.000
#
# View the attributes for debugging
#
# For every column of mygss2: print its name, the structure of its
# attributes, its value labels, and a frequency table of its values.
for (col_name in names(mygss2)) {
  print(col_name)
  str(attributes(mygss2[[col_name]]))
  print_labels(mygss2[[col_name]])
  # Index inline (not via a local variable) so table() adds no header name.
  print(table(mygss2[[col_name]]))
}
#> [1] "RACE"
#> List of 5
#> $ label : chr "Race of respondent"
#> $ na_values : num 0
#> $ class : chr [1:4] "haven_labelled_spss" "haven_labelled" "vctrs_vctr" "double"
#> $ format.spss: chr "F1.0"
#> $ labels : Named num [1:4] 0 1 2 3
#> ..- attr(*, "names")= chr [1:4] "IAP" "WHITE" "BLACK" "OTHER"
#>
#> Labels:
#> value label
#> 0 IAP
#> 1 WHITE
#> 2 BLACK
#> 3 OTHER
#>
#> 1 2 3
#> 1693 385 270
#> [1] "TAX"
#> List of 5
#> $ label : chr "R's federal income tax"
#> $ na_values : num [1:3] 0 8 9
#> $ class : chr [1:4] "haven_labelled_spss" "haven_labelled" "vctrs_vctr" "double"
#> $ format.spss: chr "F1.0"
#> $ labels : Named num [1:7] 0 1 2 3 4 8 9
#> ..- attr(*, "names")= chr [1:7] "IAP" "TOO HIGH" "ABOUT RIGHT" "TOO LOW" ...
#>
#> Labels:
#> value label
#> 0 IAP
#> 1 TOO HIGH
#> 2 ABOUT RIGHT
#> 3 TOO LOW
#> 4 R PAYS NONE<VOL.>
#> 8 DK
#> 9 NA
#>
#> 0 1 2 3 8 9
#> 774 829 664 38 39 4
# Same hightax derivation as applied to mygss1, now on the user_na = TRUE data.
# This is the call that fails: the error recorded below reports that the
# labelled_spss<double> and double inputs cannot be combined.
mygss2 <- mygss2 %>%
mutate(hightax = case_when(TAX == 1 ~ 1,
TAX == 2 | TAX == 3 ~ 0))
#> Error: Problem with `mutate()` input `hightax`.
#> x Can't combine `..1` <labelled_spss<double>> and `..2` <double>.
#> ℹ Input `hightax` is `case_when(TAX == 1 ~ 1, TAX == 2 | TAX == 3 ~ 0)`.
sessionInfo()
#> R version 3.6.3 (2020-02-29)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 18.04.4 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] haven_2.3.1 dplyr_1.0.0
#>
#> loaded via a namespace (and not attached):
#> [1] Rcpp_1.0.5 knitr_1.29 magrittr_1.5 hms_0.5.3
#> [5] tidyselect_1.1.0 R6_2.4.1 rlang_0.4.7 fansi_0.4.1
#> [9] stringr_1.4.0 highr_0.8 tools_3.6.3 xfun_0.15
#> [13] cli_2.0.2 htmltools_0.5.0 ellipsis_0.3.1 assertthat_0.2.1
#> [17] yaml_2.2.1 digest_0.6.25 tibble_3.0.3 lifecycle_0.2.0
#> [21] crayon_1.3.4 readr_1.3.1 purrr_0.3.4 vctrs_0.3.2
#> [25] glue_1.4.1 evaluate_0.14 rmarkdown_2.3 stringi_1.4.6
#> [29] compiler_3.6.3 pillar_1.4.6 forcats_0.5.0 generics_0.0.2
#> [33] pkgconfig_2.0.3
Created on 2020-07-29 by the reprex package (v0.3.0)