Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion r/R/dataset-factory.R
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,9 @@ DatasetFactory$create <- function(x,
#' @param ... Additional format-specific options, passed to
#' `FileFormat$create()`. For CSV options, note that you can specify them either
#' with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the
#' `readr`-style naming used in [read_csv_arrow()] ("delim", "quote", etc.)
#' `readr`-style naming used in [read_csv_arrow()] ("delim", "quote", etc.).
#' Not all `readr` options are currently supported; please file an issue if you
#' encounter one that `arrow` should support.
#' @return A `DatasetFactory` object. Pass this to [open_dataset()],
#' in a list potentially with other `DatasetFactory` objects, to create
#' a `Dataset`.
Expand Down
68 changes: 62 additions & 6 deletions r/R/dataset-format.R
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,9 @@
#'
#' `format = "text"`: see [CsvReadOptions]. Note that you can specify them either
#' with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the
#' `readr`-style naming used in [read_csv_arrow()] ("delim", "quote", etc.)
#' `readr`-style naming used in [read_csv_arrow()] ("delim", "quote", etc.).
#' Not all `readr` options are currently supported; please file an issue if
#' you encounter one that `arrow` should support.
#'
#' It returns the appropriate subclass of `FileFormat` (e.g. `ParquetFileFormat`)
#' @rdname FileFormat
Expand Down Expand Up @@ -103,13 +105,67 @@ CsvFileFormat$create <- function(..., opts = csv_file_format_parse_options(...))
dataset___CsvFileFormat__Make(opts)
}

# Support both readr-style option names and Arrow C++ option names
csv_file_format_parse_options <- function(...) {
# Support both the readr spelling of options and the arrow spelling
readr_opts <- c("delim", "quote", "escape_double", "escape_backslash", "skip_empty_rows")
if (any(readr_opts %in% names(list(...)))) {
readr_to_csv_parse_options(...)
opt_names <- names(list(...))
# Catch any readr-style options specified with full option names that are
# supported by read_delim_arrow() (and its wrappers) but are not yet
# supported here
unsup_readr_opts <- setdiff(
names(formals(read_delim_arrow)),
names(formals(readr_to_csv_parse_options))
)
is_unsup_opt <- opt_names %in% unsup_readr_opts
unsup_opts <- opt_names[is_unsup_opt]
if (length(unsup_opts)) {
stop(
"The following ",
ngettext(length(unsup_opts), "option is ", "options are "),
"supported in \"read_delim_arrow\" functions ",
"but not yet supported here: ",
oxford_paste(unsup_opts),
call. = FALSE
)
}
# Catch any options with full or partial names that do not match any of the
# recognized Arrow C++ option names or readr-style option names
arrow_opts <- names(formals(CsvParseOptions$create))
readr_opts <- names(formals(readr_to_csv_parse_options))
is_arrow_opt <- !is.na(pmatch(opt_names, arrow_opts))
is_readr_opt <- !is.na(pmatch(opt_names, readr_opts))
unrec_opts <- opt_names[!is_arrow_opt & !is_readr_opt]
if (length(unrec_opts)) {
stop(
"Unrecognized ",
ngettext(length(unrec_opts), "option", "options"),
": ",
oxford_paste(unrec_opts),
call. = FALSE
)
}
# Catch options with ambiguous partial names (such as "del") that make it
# unclear whether the user is specifying Arrow C++ options ("delimiter") or
# readr-style options ("delim")
is_ambig_opt <- is.na(pmatch(opt_names, c(arrow_opts, readr_opts)))
ambig_opts <- opt_names[is_ambig_opt]
if (length(ambig_opts)) {
stop("Ambiguous ",
ngettext(length(ambig_opts), "option", "options"),
": ",
oxford_paste(ambig_opts),
". Use full argument names",
call. = FALSE)
}
if (any(is_readr_opt)) {
# Catch cases when the user specifies a mix of Arrow C++ options and
# readr-style options
if (!all(is_readr_opt)) {
stop("Use either Arrow parse options or readr parse options, not both",
call. = FALSE)
}
readr_to_csv_parse_options(...) # all options have readr-style names
} else {
CsvParseOptions$create(...)
CsvParseOptions$create(...) # all options have Arrow C++ names
}
}

Expand Down
4 changes: 3 additions & 1 deletion r/man/FileFormat.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion r/man/dataset_factory.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

60 changes: 57 additions & 3 deletions r/tests/testthat/test-dataset.R
Original file line number Diff line number Diff line change
Expand Up @@ -303,11 +303,65 @@ test_that("Other text delimited dataset", {
filter(integer > 6) %>%
summarize(mean = mean(integer))
)
})

test_that("readr parse options", {
arrow_opts <- names(formals(CsvParseOptions$create))
readr_opts <- names(formals(readr_to_csv_parse_options))

# Arrow and readr parse options must be mutually exclusive, or else the code
# in `csv_file_format_parse_options()` will error or behave incorrectly. A
# failure of this test indicates that these two sets of option names are not
# mutually exclusive.
expect_equal(
intersect(arrow_opts, readr_opts),
character(0)
)

# Now with readr option spelling (and omitting format = "text")
ds3 <- open_dataset(tsv_dir, partitioning = "part", delim = "\t")
# With not yet supported readr parse options (ARROW-8631)
expect_error(
open_dataset(tsv_dir, partitioning = "part", delim = "\t", na = "\\N"),
"supported"
)

# With unrecognized (garbage) parse options
expect_error(
open_dataset(
tsv_dir,
partitioning = "part",
format = "text",
asdfg = "\\"
),
"Unrecognized"
)

# With both Arrow and readr parse options (disallowed)
expect_error(
open_dataset(
tsv_dir,
partitioning = "part",
format = "text",
quote = "\"",
quoting = TRUE
),
"either"
)

# With ambiguous partial option names (disallowed)
expect_error(
open_dataset(
tsv_dir,
partitioning = "part",
format = "text",
quo = "\"",
),
"Ambiguous"
)

# With only readr parse options (and omitting format = "text")
ds1 <- open_dataset(tsv_dir, partitioning = "part", delim = "\t")
expect_equivalent(
ds3 %>%
ds1 %>%
select(string = chr, integer = int, part) %>%
filter(integer > 6 & part == 5) %>%
collect() %>%
Expand Down