First version of the fwrite function #580

Rdatatable · Mar 27, 2016 · d794b3b · mattdowle · Oct 28, 2016 · d794b3b
1 parent 6f58f5c
commit d794b3b
Show file tree

Hide file tree

Showing 7 changed files with 279 additions and 0 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -16,6 +16,7 @@ export(setNumericRounding, getNumericRounding)
 export(chmatch, "%chin%", chorder, chgroup)
 export(rbindlist)
 export(fread)
+export(fwrite)
 export(foverlaps)
 export(shift)
 export(transpose)

diff --git a/R/fwrite.R b/R/fwrite.R
@@ -0,0 +1,62 @@
+fwrite <- function(dt, file.path, append = FALSE, quote = TRUE,
+                   sep = ",", eol = "\n", col.names = TRUE, qmethod = "double",
+                   block.size = 10000) {
+
+  # validate arguments
+  stopifnot(is.data.frame(dt))
+  stopifnot(ncol(dt) > 0)
+  stopifnot(identical(unique(names(dt)), names(dt)))
+
+  stopifnot(length(quote) == 1 && class(quote) == "logical")
+  stopifnot(length(sep) == 1 && class(sep) == "character" && nchar(sep) == 1)
+  stopifnot(length(eol) == 1 && class(eol) == "character")
+  stopifnot(length(qmethod) == 1 && qmethod %in% c("double", "escape"))
+  stopifnot(length(col.names) == 1 && class(col.names) == "logical")
+  stopifnot(length(append) == 1 && class(append) == "logical")
+  stopifnot(length(block.size) == 1 && block.size > 0)
+
+  quoted_cols <- rep(quote, ncol(dt))
+
+  # special case: single-column data.frame, doing dt[block_begin:block_end,]
+  # for such data frame gives a vector
+  if (!is.data.table(dt) && ncol(dt) == 1) dt <- as.data.table(dt)
+
+  # write header row separately for correct quoting of row names
+  if (col.names && !append) {
+    .Call(Cwritefile, as.list(names(dt)), file.path, sep, eol, quoted_cols, qmethod == "escape", append)
+    append <- TRUE
+  }
+
+  # handle empty dt
+  if (nrow(dt) == 0) return()
+
+  # determine from column types, which ones should be quoted
+  if (quote) {
+    column_types <- lapply(dt, class)
+    quoted_cols <- column_types %in% c('character', 'factor')
+  }
+
+  # write in blocks of given size to avoid generating full copies
+  # of columns in memory
+  block_begin <- 1
+
+  repeat {
+    block_end <- min(block_begin+(block.size-1), nrow(dt))
+
+    dt_block <- dt[c(block_begin:block_end),]
+
+    # convert data.frame row block to a list of columns
+    col_list <- lapply(dt_block, function(column) {
+      str_col <- as.character(column)
+      str_col[is.na(str_col)] <- ''
+      str_col
+    })
+
+    .Call(Cwritefile, col_list, file.path, sep, eol, quoted_cols, qmethod == "escape", append)
+
+    if (block_end == nrow(dt)) break
+
+    append <- TRUE
+    block_begin <- block_end+1
+  }
+}
diff --git a/README.md b/README.md
@@ -73,6 +73,8 @@
   28. Joins (and binary search based subsets) using `on=` argument now reuses existing (secondary) indices, [#1439](https://github.com/Rdatatable/data.table/issues/1439). Thanks @jangorecki.
 
   29. New `split` method for data.table. Faster, more flexible and consistent with data.frame method. Closes [#1389](https://github.com/Rdatatable/data.table/issues/1389).
+
+  30. New function `fwrite`. Fixes [#580](https://github.com/Rdatatable/data.table/issues/580). Thanks @oseiskar.
 
 #### BUG FIXES
 

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -8404,6 +8404,118 @@ test(1639.137, sort.by.names(ans), sort.by.names(unlist(split(setDT(df), by=c("p
 test(1639.138, ans, split(as.data.table(df), by=c("product","year")))
 test(1639.139, sort.by.names(ans), sort.by.names(unlist(split(as.data.table(df), by=c("product","year"), flatten=FALSE), recursive = FALSE)))
 
+# fwrite tests
+local({
+  fwrite_test <- function(number, writer, expected_result) {
+    f <- tempfile()
+    writer(f)
+    result <- readChar(f, file.info(f)$size)
+    unlink(f)
+    test(number, result, expected_result)
+  }
+
+  # without quoting
+  fwrite_test(1640.1, function(f) {
+    fwrite(data.table(a=c(NA, 2, 3.01), b=c('foo', NA, 'bar')), f, quote=F)
+  }, 'a,b\n,foo\n2,\n3.01,bar\n')
+
+  # with quoting and qmethod="escape"
+  fwrite_test(1640.2, function(f) {
+    fwrite(data.table(
+      a=c(NA, 2, 3.01),
+      `other column`=c('foo bar', NA, 'quote" and \\ bs \n and newline')),
+      f, quote=T, qmethod="escape")
+  }, '"a","other column"\n,"foo bar"\n2,""\n3.01,"quote\\" and \\\\ bs \n and newline"\n')
+
+  # with quoting and qmethod="double" (default)
+  fwrite_test(1640.3, function(f) {
+    fwrite(data.table(
+      a=c(NA, 1e-10, 3.01),
+      `other "column`=c('foo bar', NA, 'quote" and \\ bs')),
+      f, quote=T, qmethod="double")
+  }, '"a","other ""column"\n,"foo bar"\n1e-10,""\n3.01,"quote"" and \\ bs"\n')
+
+  # changing sep
+  fwrite_test(1640.4, function(f) { fwrite(data.table(a="foo", b="ba\"r"), f, sep=";") },
+              '"a";"b"\n"foo";"ba""r"\n')
+
+  # changing eol
+  fwrite_test(1640.5, function(f) { fwrite(data.table(a="foo", b="bar"), f, eol="\r\n") },
+              '"a","b"\r\n"foo","bar"\r\n')
+
+  # no col.names
+  fwrite_test(1640.6, function(f) { fwrite(data.table(a="foo", b="bar"), f, col.names=F) },
+              '"foo","bar"\n')
+
+  # small block size to assert that blocking works correctly
+  fwrite_test(1640.7, function(f) { fwrite(data.table(a=c(1:5), b=c(1:5)), f, block.size=2) },
+              '"a","b"\n1,1\n2,2\n3,3\n4,4\n5,5\n')
+
+  # block size equal to number of rows
+  fwrite_test(1640.8, function(f) { fwrite(data.table(a=c(1:3), b=c(1:3)), f, block.size=3) },
+              '"a","b"\n1,1\n2,2\n3,3\n')
+
+  # block size one bigger than number of rows
+  fwrite_test(1640.9, function(f) { fwrite(data.table(a=c(1:3), b=c(1:3)), f, block.size=4) },
+              '"a","b"\n1,1\n2,2\n3,3\n')
+
+  # block size one less than number of rows
+  fwrite_test(1640.10, function(f) { fwrite(data.table(a=c(1:3), b=c(1:3)), f, block.size=2) },
+              '"a","b"\n1,1\n2,2\n3,3\n')
+
+  # writing a data.frame
+  fwrite_test(1640.11, function(f) { fwrite(data.frame(a="foo", b="bar"), f) },
+              '"a","b"\n"foo","bar"\n')
+
+  # single-column data.table
+  fwrite_test(1640.12, function(f) { fwrite(data.table(a=c(1,2,3)), f) },
+              '"a"\n1\n2\n3\n')
+
+  # single-column data.frame
+  fwrite_test(1640.13, function(f) { fwrite(data.frame(a=c(1,2,3)), f) },
+              '"a"\n1\n2\n3\n')
+
+  # factor columns
+  fwrite_test(1640.14, function(f) {
+    fwrite(data.table(a=as.factor(c('foo', 'bar')), b=as.factor(c(NA, "baz"))), f)
+  }, '"a","b"\n"foo",""\n"bar","baz"\n')
+
+  # empty data table (headers but no rows)
+  empty_dt <- data.table(a=1, b=2)[0,]
+  fwrite_test(1640.15, function(f) { fwrite(empty_dt, f) }, '"a","b"\n')
+
+  # test append
+  f <- tempfile()
+  fwrite(data.table(a=c(1,2), b=c('a', 'b')), f)
+  fwrite(data.table(a=c(3,4), b=c('c', 'd')), f, append=T)
+  res <- readChar(f, file.info(f)$size)
+  unlink(f)
+  test(1640.16, res, '"a","b"\n1,"a"\n2,"b"\n3,"c"\n4,"d"\n')
+
+  # simple data table (reference for the error cases below)
+  ok_dt <- data.table(foo="bar")
+  fwrite_test(1640.17, function(f) { fwrite(ok_dt, f) }, '"foo"\n"bar"\n')
+
+  # error cases
+  fwrite_expect_error <- function(test_number, writer) {
+    f <- tempfile()
+    was_error <- F
+    tryCatch(writer(f), error=function(e) { was_error <<- T })
+    test(test_number, TRUE, was_error)
+    stopifnot(!file.exists(f))
+  }
+
+  # wrong argument types
+  fwrite_expect_error(1640.18, function(f) {fwrite(ok_dt, 1)})
+  fwrite_expect_error(1640.19, function(f) {fwrite(ok_dt, f, quote=123)})
+  fwrite_expect_error(1640.20, function(f) {fwrite(ok_dt, f, sep="...")})
+  fwrite_expect_error(1640.21, function(f) {fwrite(ok_dt, f, qmethod=c("double", "double"))})
+  fwrite_expect_error(1640.22, function(f) {fwrite(ok_dt, f, col.names="foobar")})
+  fwrite_expect_error(1640.23, function(f) {fwrite(data.table(a=1, a=2), f)})
+
+  # null data table (no columns)
+  fwrite_expect_error(1640.24, function(f) {fwrite(data.table(a=1)[NULL,], f)})
+})
 
 ##########################
 

diff --git a/man/fwrite.Rd b/man/fwrite.Rd
@@ -0,0 +1,41 @@
+\name{fwrite}
+\alias{fwrite}
+\title{Fast CSV writer}
+\description{
+  Similar to \code{write.table} but faster and more limited in features.
+}
+\usage{
+fwrite(dt, file.path, append = FALSE, quote = TRUE, sep = ",", eol = "\n",
+       col.names = TRUE, qmethod = "double", block.size = 10000)
+}
+\arguments{
+  \item{dt}{The \code{data.table} or \code{data.frame} to be write}
+  \item{file.path}{Output file name}
+  \item{append}{If \code{TRUE}, the file is opened in append mode and column names (header row) are not written.}
+  \item{quote}{If \code{TRUE}, all columns of character and factor types, as well as all column names, will be surrounded by double quotes. If \code{FALSE}, nothing is quoted, even if this would break the CSV (the column contents are not checked for separator characters).}
+  \item{sep}{The separator between columns}
+  \item{eol}{Line separator}
+  \item{col.names}{A logical value indicating if the column names (header row) should be written}
+  \item{qmethod}{A character string specifying how to deal with embedded double quote characters when quoting strings. Must be one of "escape", in which case the quote character (as well as the backslash character) is escaped in C style by a backslash, or "double" (default), in which case it is doubled.}
+  \item{block.size}{The output is written in blocks, each of which contains at most this number of rows. This is to avoid making large copies in memory. Can be used to tweak performance and memory usage.}
+}
+\details{
+\code{NA} values are always represented by empty strings, quoted for character and factor columns if \code{quote=TRUE}.
+
+The speed-up compared to \code{write.csv} depends on the parameters and column types. This version should be significantly faster if the table consists mainly of character columns. In addition, improved performance can be expected if one can make do with \code{quote=FALSE}.
+}
+\seealso{ \code{\link[utils]{write.csv}} }
+\examples{
+\dontrun{
+
+fwrite(data.table(first=c(1,2), second=c(NA, 'foo"bar')), "table.csv")
+
+# table.csv contains:
+
+# "first","second"
+# "1",""
+# "2","foo""bar"
+}
+}
+\keyword{ data }
+
diff --git a/src/fwrite.c b/src/fwrite.c
@@ -0,0 +1,59 @@
+#include <R.h>
+#include <errno.h>
+#include <Rinternals.h>
+
+void writefile(SEXP list_of_columns,
+               SEXP filename,
+               SEXP col_sep_exp,
+               SEXP row_sep_exp,
+               SEXP quote_cols,
+               SEXP qmethod_escape_exp,
+               SEXP append) {
+
+  int error_number = 0;
+  int qmethod_escape = *LOGICAL(qmethod_escape_exp);
+
+  errno = 0; /* clear flag possibly set by previous errors */
+
+  char col_sep = *CHAR(STRING_ELT(col_sep_exp, 0));
+  const char *row_sep = CHAR(STRING_ELT(row_sep_exp, 0));
+  const char QUOTE_CHAR = '"';
+  const char ESCAPE_CHAR = '\\';
+
+  /* open input file in correct mode */
+  const char *open_mode = "wb";
+  if (*LOGICAL(append)) open_mode = "ab";
+  FILE *f = fopen(CHAR(STRING_ELT(filename, 0)), open_mode);
+  if (f == NULL) goto end;
+
+  R_xlen_t ncols = LENGTH(list_of_columns);
+  R_xlen_t nrows = LENGTH(VECTOR_ELT(list_of_columns, 0));
+
+  for (R_xlen_t row_i = 0; row_i < nrows; ++row_i) {
+    for (int col_i = 0; col_i < ncols; ++col_i) {
+
+      if (col_i > 0) fputc(col_sep, f);
+
+      int quote = LOGICAL(quote_cols)[col_i];
+
+      if (quote) fputc(QUOTE_CHAR, f);
+      for (const char *ch = CHAR(STRING_ELT(VECTOR_ELT(list_of_columns, col_i), row_i)); *ch != '\0'; ++ch) {
+        if (quote) {
+          if (*ch == QUOTE_CHAR) {
+            if (qmethod_escape) fputc(ESCAPE_CHAR, f);
+            else fputc(QUOTE_CHAR, f); /* qmethod = "double" */
+          }
+          if (qmethod_escape && *ch == ESCAPE_CHAR) fputc(ESCAPE_CHAR, f);
+        }
+        fputc(*ch, f);
+      }
+      if (quote) fputc(QUOTE_CHAR, f);
+    }
+    if (fputs(row_sep, f) < 0) goto end;
+  }
+
+  end:
+    error_number = errno;
+  if (f != NULL) fclose(f);
+  if (error_number) error(strerror(errno));
+}
diff --git a/src/init.c b/src/init.c
@@ -17,6 +17,7 @@ SEXP setcharvec();
 SEXP setcolorder();
 SEXP chmatchwrapper();
 SEXP readfile();
+SEXP writefile();
 SEXP reorder();
 SEXP rbindlist();
 SEXP vecseq();
@@ -87,6 +88,7 @@ R_CallMethodDef callMethods[] = {
 {"Csetcolorder", (DL_FUNC) &setcolorder, -1},
 {"Cchmatchwrapper", (DL_FUNC) &chmatchwrapper, -1},
 {"Creadfile", (DL_FUNC) &readfile, -1},
+{"Cwritefile", (DL_FUNC) &writefile, -1},
 {"Creorder", (DL_FUNC) &reorder, -1},
 {"Crbindlist", (DL_FUNC) &rbindlist, -1},
 {"Cvecseq", (DL_FUNC) &vecseq, -1},