pitrou · pitrou · Jul 23, 2019 · Jul 18, 2019 · Jul 22, 2019
diff --git a/r/R/csv.R b/r/R/csv.R
@@ -42,16 +42,16 @@
 #' characters? This is more general than `escape_double` as backslashes
 #' can be used to escape the delimiter character, the quote character, or
 #' to add special characters like `\\n`.
-# #' @param col_names If `TRUE`, the first row of the input will be used as the
-# #' column names and will not be included in the data frame. Note that `FALSE`
-# #' is not currently supported, nor is specifying a character vector of column
-# #' names.
+#' @param col_names If `TRUE`, the first row of the input will be used as the
+#' column names and will not be included in the data frame. (Note that `FALSE`
+#' is not currently supported.) Alternatively, you can specify a character
+#' vector of column names.
 #' @param col_select A [tidy selection specification][tidyselect::vars_select]
 #' of columns, as used in `dplyr::select()`.
 #' @param skip_empty_rows Should blank rows be ignored altogether? If
 #' `TRUE`, blank rows will not be represented at all. If `FALSE`, they will be
 #' filled with missings.
-# #' @param skip Number of lines to skip before reading data.
+#' @param skip Number of lines to skip before reading data.
 #' @param parse_options see [csv_parse_options()]. If given, this overrides any
 #' parsing options provided in other arguments (e.g. `delim`, `quote`, etc.).
 #' @param convert_options see [csv_convert_options()]
@@ -66,39 +66,41 @@ read_delim_arrow <- function(file,
                              quote = '"',
                              escape_double = TRUE,
                              escape_backslash = FALSE,
-                             # col_names = TRUE,
+                             col_names = TRUE,
                              # col_types = TRUE,
                              col_select = NULL,
                              # na = c("", "NA"),
                              # quoted_na = TRUE,
                              skip_empty_rows = TRUE,
-                             # skip = 0L,
+                             skip = 0L,
                              parse_options = NULL,
                              convert_options = NULL,
-                             read_options = csv_read_options(),
+                             read_options = NULL,
                              as_tibble = TRUE) {
 
-  # These are hardcoded pending https://issues.apache.org/jira/browse/ARROW-5747
-  col_names <- TRUE
-  skip <- 0L
-
+  if (identical(col_names, FALSE)) {
+    stop("Not implemented", call.=FALSE)
+  }
   if (is.null(parse_options)) {
-    if (isTRUE(col_names)) {
-      # Add one row to skip, to match arrow's header_rows
-      skip <- skip + 1L
-      # Note that with the hardcoding, header_rows is always 1, which
-      # turns out to be the only value that works meaningfully
-    }
     parse_options <- readr_to_csv_parse_options(
       delim,
       quote,
       escape_double,
       escape_backslash,
-      skip_empty_rows,
-      skip
+      skip_empty_rows
     )
   }
 
+  if (is.null(read_options)) {
+    if (isTRUE(col_names)) {
+      # C++ default to parse is 0-length string array
+      col_names <- character(0)
+    }
+    read_options <- csv_read_options(
+      skip_rows = skip,
+      column_names = col_names
+    )
+  }
   if (is.null(convert_options)) {
     # TODO:
     # * na strings (needs wiring in csv_convert_options)
@@ -117,10 +119,6 @@ read_delim_arrow <- function(file,
   )
 
   tab <- reader$Read()$select(!!enquo(col_select))
-  if (is.character(col_names)) {
-    # TODO: Rename `tab`'s columns
-    # See https://github.com/apache/arrow/pull/4557
-  }
 
   if (isTRUE(as_tibble)) {
     tab <- as.data.frame(tab)
@@ -135,16 +133,16 @@ read_csv_arrow <- function(file,
                            quote = '"',
                            escape_double = TRUE,
                            escape_backslash = FALSE,
-                           # col_names = TRUE,
+                           col_names = TRUE,
                            # col_types = TRUE,
                            col_select = NULL,
                            # na = c("", "NA"),
                            # quoted_na = TRUE,
                            skip_empty_rows = TRUE,
-                           # skip = 0L,
+                           skip = 0L,
                            parse_options = NULL,
                            convert_options = NULL,
-                           read_options = csv_read_options(),
+                           read_options = NULL,
                            as_tibble = TRUE) {
 
   mc <- match.call()
@@ -159,16 +157,16 @@ read_tsv_arrow <- function(file,
                            quote = '"',
                            escape_double = TRUE,
                            escape_backslash = FALSE,
-                           # col_names = TRUE,
+                           col_names = TRUE,
                            # col_types = TRUE,
                            col_select = NULL,
                            # na = c("", "NA"),
                            # quoted_na = TRUE,
                            skip_empty_rows = TRUE,
-                           # skip = 0L,
+                           skip = 0L,
                            parse_options = NULL,
                            convert_options = NULL,
-                           read_options = csv_read_options(),
+                           read_options = NULL,
                            as_tibble = TRUE) {
 
   mc <- match.call()
@@ -192,15 +190,25 @@ read_tsv_arrow <- function(file,
 #' Read options for the Arrow file readers
 #'
 #' @param use_threads Whether to use the global CPU thread pool
-#' @param block_size Block size we request from the IO layer; also determines the size of chunks when use_threads is `TRUE`. NB: if false, JSON input must end with an empty line
+#' @param block_size Block size we request from the IO layer; also determines
+#' the size of chunks when use_threads is `TRUE`. NB: if `FALSE`, JSON input
+#' must end with an empty line.
+#' @param skip_rows Number of lines to skip before reading data.
+#' @param column_names Character vector to supply column names. If length-0
+#' (the default), the first non-skipped row will be parsed to generate column
+#' names.
 #'
 #' @export
 csv_read_options <- function(use_threads = option_use_threads(),
-                             block_size = 1048576L) {
+                             block_size = 1048576L,
+                             skip_rows = 0L,
+                             column_names = character(0)) {
   shared_ptr(`arrow::csv::ReadOptions`, csv___ReadOptions__initialize(
     list(
       use_threads = use_threads,
-      block_size = block_size
+      block_size = block_size,
+      skip_rows = skip_rows,
+      column_names = column_names
     )
   ))
 }
@@ -209,8 +217,7 @@ readr_to_csv_parse_options <- function(delim = ",",
                                        quote = '"',
                                        escape_double = TRUE,
                                        escape_backslash = FALSE,
-                                       skip_empty_rows = TRUE,
-                                       skip = 0L) {
+                                       skip_empty_rows = TRUE) {
   # This function translates from the readr argument list to the arrow arg names
   # TODO: validate inputs
   csv_parse_options(
@@ -221,8 +228,7 @@ readr_to_csv_parse_options <- function(delim = ",",
     escaping = escape_backslash,
     escape_char = '\\',
     newlines_in_values = escape_backslash,
-    ignore_empty_lines = skip_empty_rows,
-    header_rows = skip
+    ignore_empty_lines = skip_empty_rows
   )
 }
 
@@ -236,7 +242,6 @@ readr_to_csv_parse_options <- function(delim = ",",
 #' @param escape_char Escaping character (if `escaping` is `TRUE`)
 #' @param newlines_in_values Whether values are allowed to contain CR (`0x0d`) and LF (`0x0a`) characters
 #' @param ignore_empty_lines Whether empty lines are ignored.  If `FALSE`, an empty line represents
-#' @param header_rows Number of header rows to skip (including the first row containing column names)
 #'
 #' @export
 csv_parse_options <- function(delimiter = ",",
@@ -246,8 +251,7 @@ csv_parse_options <- function(delimiter = ",",
                               escaping = FALSE,
                               escape_char = '\\',
                               newlines_in_values = FALSE,
-                              ignore_empty_lines = TRUE,
-                              header_rows = 1L) {
+                              ignore_empty_lines = TRUE) {
 
   shared_ptr(`arrow::csv::ParseOptions`, csv___ParseOptions__initialize(
     list(
@@ -258,8 +262,7 @@ csv_parse_options <- function(delimiter = ",",
       escaping = escaping,
       escape_char = escape_char,
       newlines_in_values = newlines_in_values,
-      ignore_empty_lines = ignore_empty_lines,
-      header_rows = header_rows
+      ignore_empty_lines = ignore_empty_lines
     )
   ))
 }

diff --git a/r/README.md b/r/README.md
@@ -48,14 +48,6 @@ library.
 
 ``` r
 library(arrow)
-#> 
-#> Attaching package: 'arrow'
-#> The following object is masked from 'package:utils':
-#> 
-#>     timestamp
-#> The following objects are masked from 'package:base':
-#> 
-#>     array, table
 set.seed(24)
 
 tab <- arrow::table(x = 1:10, y = rnorm(10))

diff --git a/r/man/csv_parse_options.Rd b/r/man/csv_parse_options.Rd
diff --git a/r/man/csv_read_options.Rd b/r/man/csv_read_options.Rd
diff --git a/r/man/read_delim_arrow.Rd b/r/man/read_delim_arrow.Rd
diff --git a/r/src/csv.cpp b/r/src/csv.cpp
@@ -28,6 +28,8 @@ std::shared_ptr<arrow::csv::ReadOptions> csv___ReadOptions__initialize(List_ opt
       std::make_shared<arrow::csv::ReadOptions>(arrow::csv::ReadOptions::Defaults());
   res->use_threads = options["use_threads"];
   res->block_size = options["block_size"];
+  res->skip_rows = options["skip_rows"];
+  res->column_names = Rcpp::as<std::vector<std::string>>(options["column_names"]);
   return res;
 }
 
@@ -43,7 +45,6 @@ std::shared_ptr<arrow::csv::ParseOptions> csv___ParseOptions__initialize(List_ o
   res->double_quote = options["double_quote"];
   res->escape_char = get_char(options["escape_char"]);
   res->newlines_in_values = options["newlines_in_values"];
-  res->header_rows = options["header_rows"];
   res->ignore_empty_lines = options["ignore_empty_lines"];
   return res;
 }

diff --git a/r/tests/testthat/test-arrow-csv.R b/r/tests/testthat/test-arrow-csv.R
@@ -81,29 +81,39 @@ test_that("read_delim_arrow parsing options: quote", {
 })
 
 test_that("read_csv_arrow parsing options: col_names", {
-  skip("Invalid: Empty CSV file")
   tf <- tempfile()
   on.exit(unlink(tf))
 
+  # Writing the CSV without the header
   write.table(iris, tf, sep = ",", row.names = FALSE, col.names = FALSE)
-  tab1 <- read_csv_arrow(tf, col_names = FALSE)
+
+  expect_error(read_csv_arrow(tf, col_names = FALSE), "Not implemented")
+
+  tab1 <- read_csv_arrow(tf, col_names = names(iris))
 
   expect_identical(names(tab1), names(iris))
   iris$Species <- as.character(iris$Species)
   expect_equivalent(iris, tab1)
+
+  # This errors (correctly) because I haven't given enough names
+  # but the error message is "Invalid: Empty CSV file", which is not accurate
+  expect_error(
+    read_csv_arrow(tf, col_names = names(iris)[1])
+  )
+  # Same here
+  expect_error(
+    read_csv_arrow(tf, col_names = c(names(iris), names(iris)))
+  )
 })
 
 test_that("read_csv_arrow parsing options: skip", {
-  skip("Invalid: Empty CSV file")
   tf <- tempfile()
   on.exit(unlink(tf))
 
+  # Adding two garbage lines to start the csv
   cat("asdf\nqwer\n", file = tf)
   suppressWarnings(write.table(iris, tf, sep = ",", row.names = FALSE, append = TRUE))
-  # This works:
-  # print(head(readr::read_csv(tf, skip = 2)))
 
-  # This errors:
   tab1 <- read_csv_arrow(tf, skip = 2)
 
   expect_identical(names(tab1), names(iris))