Skip to content

Commit

Permalink
First version of the fwrite function #580
Browse files Browse the repository at this point in the history
  • Loading branch information
oseiskar committed Mar 27, 2016
1 parent 6f58f5c commit d794b3b
Show file tree
Hide file tree
Showing 7 changed files with 279 additions and 0 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ export(setNumericRounding, getNumericRounding)
export(chmatch, "%chin%", chorder, chgroup)
export(rbindlist)
export(fread)
export(fwrite)
export(foverlaps)
export(shift)
export(transpose)
Expand Down
62 changes: 62 additions & 0 deletions R/fwrite.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
fwrite <- function(dt, file.path, append = FALSE, quote = TRUE,
sep = ",", eol = "\n", col.names = TRUE, qmethod = "double",
block.size = 10000) {

# validate arguments
stopifnot(is.data.frame(dt))
stopifnot(ncol(dt) > 0)
stopifnot(identical(unique(names(dt)), names(dt)))

stopifnot(length(quote) == 1 && class(quote) == "logical")
stopifnot(length(sep) == 1 && class(sep) == "character" && nchar(sep) == 1)
stopifnot(length(eol) == 1 && class(eol) == "character")
stopifnot(length(qmethod) == 1 && qmethod %in% c("double", "escape"))
stopifnot(length(col.names) == 1 && class(col.names) == "logical")
stopifnot(length(append) == 1 && class(append) == "logical")
stopifnot(length(block.size) == 1 && block.size > 0)

quoted_cols <- rep(quote, ncol(dt))

# special case: single-column data.frame, doing dt[block_begin:block_end,]
# for such data frame gives a vector
if (!is.data.table(dt) && ncol(dt) == 1) dt <- as.data.table(dt)

# write header row separately for correct quoting of row names
if (col.names && !append) {
.Call(Cwritefile, as.list(names(dt)), file.path, sep, eol, quoted_cols, qmethod == "escape", append)
append <- TRUE
}

# handle empty dt
if (nrow(dt) == 0) return()

# determine from column types, which ones should be quoted
if (quote) {
column_types <- lapply(dt, class)
quoted_cols <- column_types %in% c('character', 'factor')
}

# write in blocks of given size to avoid generating full copies
# of columns in memory
block_begin <- 1

repeat {
block_end <- min(block_begin+(block.size-1), nrow(dt))

dt_block <- dt[c(block_begin:block_end),]

# convert data.frame row block to a list of columns
col_list <- lapply(dt_block, function(column) {
str_col <- as.character(column)
str_col[is.na(str_col)] <- ''
str_col
})

.Call(Cwritefile, col_list, file.path, sep, eol, quoted_cols, qmethod == "escape", append)

if (block_end == nrow(dt)) break

append <- TRUE
block_begin <- block_end+1
}
}
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@
28. Joins (and binary search based subsets) using `on=` argument now reuses existing (secondary) indices, [#1439](https://github.com/Rdatatable/data.table/issues/1439). Thanks @jangorecki.

29. New `split` method for data.table. Faster, more flexible and consistent with data.frame method. Closes [#1389](https://github.com/Rdatatable/data.table/issues/1389).

30. New function `fwrite`. Fixes [#580](https://github.com/Rdatatable/data.table/issues/580). Thanks @oseiskar.

#### BUG FIXES

Expand Down
112 changes: 112 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -8404,6 +8404,118 @@ test(1639.137, sort.by.names(ans), sort.by.names(unlist(split(setDT(df), by=c("p
test(1639.138, ans, split(as.data.table(df), by=c("product","year")))
test(1639.139, sort.by.names(ans), sort.by.names(unlist(split(as.data.table(df), by=c("product","year"), flatten=FALSE), recursive = FALSE)))

# fwrite tests
local({
fwrite_test <- function(number, writer, expected_result) {
f <- tempfile()
writer(f)
result <- readChar(f, file.info(f)$size)
unlink(f)
test(number, result, expected_result)
}

# without quoting
fwrite_test(1640.1, function(f) {
fwrite(data.table(a=c(NA, 2, 3.01), b=c('foo', NA, 'bar')), f, quote=F)
}, 'a,b\n,foo\n2,\n3.01,bar\n')

# with quoting and qmethod="escape"
fwrite_test(1640.2, function(f) {
fwrite(data.table(
a=c(NA, 2, 3.01),
`other column`=c('foo bar', NA, 'quote" and \\ bs \n and newline')),
f, quote=T, qmethod="escape")
}, '"a","other column"\n,"foo bar"\n2,""\n3.01,"quote\\" and \\\\ bs \n and newline"\n')

# with quoting and qmethod="double" (default)
fwrite_test(1640.3, function(f) {
fwrite(data.table(
a=c(NA, 1e-10, 3.01),
`other "column`=c('foo bar', NA, 'quote" and \\ bs')),
f, quote=T, qmethod="double")
}, '"a","other ""column"\n,"foo bar"\n1e-10,""\n3.01,"quote"" and \\ bs"\n')

# changing sep
fwrite_test(1640.4, function(f) { fwrite(data.table(a="foo", b="ba\"r"), f, sep=";") },
'"a";"b"\n"foo";"ba""r"\n')

# changing eol
fwrite_test(1640.5, function(f) { fwrite(data.table(a="foo", b="bar"), f, eol="\r\n") },
'"a","b"\r\n"foo","bar"\r\n')

# no col.names
fwrite_test(1640.6, function(f) { fwrite(data.table(a="foo", b="bar"), f, col.names=F) },
'"foo","bar"\n')

# small block size to assert that blocking works correctly
fwrite_test(1640.7, function(f) { fwrite(data.table(a=c(1:5), b=c(1:5)), f, block.size=2) },
'"a","b"\n1,1\n2,2\n3,3\n4,4\n5,5\n')

# block size equal to number of rows
fwrite_test(1640.8, function(f) { fwrite(data.table(a=c(1:3), b=c(1:3)), f, block.size=3) },
'"a","b"\n1,1\n2,2\n3,3\n')

# block size one bigger than number of rows
fwrite_test(1640.9, function(f) { fwrite(data.table(a=c(1:3), b=c(1:3)), f, block.size=4) },
'"a","b"\n1,1\n2,2\n3,3\n')

# block size one less than number of rows
fwrite_test(1640.10, function(f) { fwrite(data.table(a=c(1:3), b=c(1:3)), f, block.size=2) },
'"a","b"\n1,1\n2,2\n3,3\n')

# writing a data.frame
fwrite_test(1640.11, function(f) { fwrite(data.frame(a="foo", b="bar"), f) },
'"a","b"\n"foo","bar"\n')

# single-column data.table
fwrite_test(1640.12, function(f) { fwrite(data.table(a=c(1,2,3)), f) },
'"a"\n1\n2\n3\n')

# single-column data.frame
fwrite_test(1640.13, function(f) { fwrite(data.frame(a=c(1,2,3)), f) },
'"a"\n1\n2\n3\n')

# factor columns
fwrite_test(1640.14, function(f) {
fwrite(data.table(a=as.factor(c('foo', 'bar')), b=as.factor(c(NA, "baz"))), f)
}, '"a","b"\n"foo",""\n"bar","baz"\n')

# empty data table (headers but no rows)
empty_dt <- data.table(a=1, b=2)[0,]
fwrite_test(1640.15, function(f) { fwrite(empty_dt, f) }, '"a","b"\n')

# test append
f <- tempfile()
fwrite(data.table(a=c(1,2), b=c('a', 'b')), f)
fwrite(data.table(a=c(3,4), b=c('c', 'd')), f, append=T)
res <- readChar(f, file.info(f)$size)
unlink(f)
test(1640.16, res, '"a","b"\n1,"a"\n2,"b"\n3,"c"\n4,"d"\n')

# simple data table (reference for the error cases below)
ok_dt <- data.table(foo="bar")
fwrite_test(1640.17, function(f) { fwrite(ok_dt, f) }, '"foo"\n"bar"\n')

# error cases
fwrite_expect_error <- function(test_number, writer) {

This comment has been minimized.

Copy link
@mattdowle

mattdowle Oct 28, 2016

Member

Great tests. Just small note that data.table's test() function already has an error= argument to test for expected error and that the error contains that text. Will replace fwrite_expect_error with using test(...,error="...").

f <- tempfile()
was_error <- F
tryCatch(writer(f), error=function(e) { was_error <<- T })
test(test_number, TRUE, was_error)
stopifnot(!file.exists(f))
}

# wrong argument types
fwrite_expect_error(1640.18, function(f) {fwrite(ok_dt, 1)})
fwrite_expect_error(1640.19, function(f) {fwrite(ok_dt, f, quote=123)})
fwrite_expect_error(1640.20, function(f) {fwrite(ok_dt, f, sep="...")})
fwrite_expect_error(1640.21, function(f) {fwrite(ok_dt, f, qmethod=c("double", "double"))})
fwrite_expect_error(1640.22, function(f) {fwrite(ok_dt, f, col.names="foobar")})
fwrite_expect_error(1640.23, function(f) {fwrite(data.table(a=1, a=2), f)})

# null data table (no columns)
fwrite_expect_error(1640.24, function(f) {fwrite(data.table(a=1)[NULL,], f)})
})

##########################

Expand Down
41 changes: 41 additions & 0 deletions man/fwrite.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
\name{fwrite}
\alias{fwrite}
\title{Fast CSV writer}
\description{
Similar to \code{write.table} but faster and more limited in features.
}
\usage{
fwrite(dt, file.path, append = FALSE, quote = TRUE, sep = ",", eol = "\n",
col.names = TRUE, qmethod = "double", block.size = 10000)
}
\arguments{
\item{dt}{The \code{data.table} or \code{data.frame} to be write}
\item{file.path}{Output file name}
\item{append}{If \code{TRUE}, the file is opened in append mode and column names (header row) are not written.}
\item{quote}{If \code{TRUE}, all columns of character and factor types, as well as all column names, will be surrounded by double quotes. If \code{FALSE}, nothing is quoted, even if this would break the CSV (the column contents are not checked for separator characters).}
\item{sep}{The separator between columns}
\item{eol}{Line separator}
\item{col.names}{A logical value indicating if the column names (header row) should be written}
\item{qmethod}{A character string specifying how to deal with embedded double quote characters when quoting strings. Must be one of "escape", in which case the quote character (as well as the backslash character) is escaped in C style by a backslash, or "double" (default), in which case it is doubled.}
\item{block.size}{The output is written in blocks, each of which contains at most this number of rows. This is to avoid making large copies in memory. Can be used to tweak performance and memory usage.}
}
\details{
\code{NA} values are always represented by empty strings, quoted for character and factor columns if \code{quote=TRUE}.

The speed-up compared to \code{write.csv} depends on the parameters and column types. This version should be significantly faster if the table consists mainly of character columns. In addition, improved performance can be expected if one can make do with \code{quote=FALSE}.
}
\seealso{ \code{\link[utils]{write.csv}} }
\examples{
\dontrun{

fwrite(data.table(first=c(1,2), second=c(NA, 'foo"bar')), "table.csv")

# table.csv contains:

# "first","second"
# "1",""
# "2","foo""bar"
}
}
\keyword{ data }

59 changes: 59 additions & 0 deletions src/fwrite.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#include <R.h>
#include <errno.h>
#include <Rinternals.h>

void writefile(SEXP list_of_columns,
SEXP filename,
SEXP col_sep_exp,
SEXP row_sep_exp,
SEXP quote_cols,
SEXP qmethod_escape_exp,
SEXP append) {

int error_number = 0;
int qmethod_escape = *LOGICAL(qmethod_escape_exp);

errno = 0; /* clear flag possibly set by previous errors */

char col_sep = *CHAR(STRING_ELT(col_sep_exp, 0));
const char *row_sep = CHAR(STRING_ELT(row_sep_exp, 0));
const char QUOTE_CHAR = '"';
const char ESCAPE_CHAR = '\\';

/* open input file in correct mode */
const char *open_mode = "wb";
if (*LOGICAL(append)) open_mode = "ab";
FILE *f = fopen(CHAR(STRING_ELT(filename, 0)), open_mode);
if (f == NULL) goto end;

R_xlen_t ncols = LENGTH(list_of_columns);
R_xlen_t nrows = LENGTH(VECTOR_ELT(list_of_columns, 0));

for (R_xlen_t row_i = 0; row_i < nrows; ++row_i) {
for (int col_i = 0; col_i < ncols; ++col_i) {

if (col_i > 0) fputc(col_sep, f);

int quote = LOGICAL(quote_cols)[col_i];

if (quote) fputc(QUOTE_CHAR, f);
for (const char *ch = CHAR(STRING_ELT(VECTOR_ELT(list_of_columns, col_i), row_i)); *ch != '\0'; ++ch) {
if (quote) {
if (*ch == QUOTE_CHAR) {
if (qmethod_escape) fputc(ESCAPE_CHAR, f);
else fputc(QUOTE_CHAR, f); /* qmethod = "double" */
}
if (qmethod_escape && *ch == ESCAPE_CHAR) fputc(ESCAPE_CHAR, f);
}
fputc(*ch, f);
}
if (quote) fputc(QUOTE_CHAR, f);
}
if (fputs(row_sep, f) < 0) goto end;
}

end:
error_number = errno;
if (f != NULL) fclose(f);
if (error_number) error(strerror(errno));
}
2 changes: 2 additions & 0 deletions src/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ SEXP setcharvec();
SEXP setcolorder();
SEXP chmatchwrapper();
SEXP readfile();
SEXP writefile();
SEXP reorder();
SEXP rbindlist();
SEXP vecseq();
Expand Down Expand Up @@ -87,6 +88,7 @@ R_CallMethodDef callMethods[] = {
{"Csetcolorder", (DL_FUNC) &setcolorder, -1},
{"Cchmatchwrapper", (DL_FUNC) &chmatchwrapper, -1},
{"Creadfile", (DL_FUNC) &readfile, -1},
{"Cwritefile", (DL_FUNC) &writefile, -1},
{"Creorder", (DL_FUNC) &reorder, -1},
{"Crbindlist", (DL_FUNC) &rbindlist, -1},
{"Cvecseq", (DL_FUNC) &vecseq, -1},
Expand Down

0 comments on commit d794b3b

Please sign in to comment.