Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add gzip support to fwrite #3278

Closed
Closed
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions R/fwrite.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,13 @@ fwrite <- function(x, file="", append=FALSE, quote="auto",
dateTimeAs = c("ISO","squash","epoch","write.csv"),
buffMB=8, nThread=getDTthreads(verbose),
showProgress=getOption("datatable.showProgress", interactive()),
verbose=getOption("datatable.verbose", FALSE)) {
compress = c("auto", "none", "gzip"),
verbose=getOption("datatable.verbose", FALSE)
) {
isLOGICAL = function(x) isTRUE(x) || identical(FALSE, x) # it seems there is no isFALSE in R?
na = as.character(na[1L]) # fix for #1725
if (missing(qmethod)) qmethod = qmethod[1L]
if (missing(compress)) compress = compress[1L]
if (missing(dateTimeAs)) { dateTimeAs = dateTimeAs[1L] }
else if (length(dateTimeAs)>1L) stop("dateTimeAs must be a single string")
dateTimeAs = chmatch(dateTimeAs, c("ISO","squash","epoch","write.csv"))-1L
Expand Down Expand Up @@ -38,13 +41,17 @@ fwrite <- function(x, file="", append=FALSE, quote="auto",
dec != sep, # sep2!=dec and sep2!=sep checked at C level when we know if list columns are present
is.character(eol) && length(eol)==1L,
length(qmethod) == 1L && qmethod %chin% c("double", "escape"),
length(compress) == 1L && compress %chin% c("auto", "none", "gzip"),
isLOGICAL(col.names), isLOGICAL(append), isLOGICAL(row.names),
isLOGICAL(verbose), isLOGICAL(showProgress), isLOGICAL(logical01),
length(na) == 1L, #1725, handles NULL or character(0) input
is.character(file) && length(file)==1L && !is.na(file),
length(buffMB)==1L && !is.na(buffMB) && 1L<=buffMB && buffMB<=1024,
length(nThread)==1L && !is.na(nThread) && nThread>=1L
)

is_gzip <- compress == "gzip" || (compress == "auto" && grepl("\\.gz$", file))

file <- path.expand(file) # "~/foo/bar"
if (append && missing(col.names) && (file=="" || file.exists(file)))
col.names = FALSE # test 1658.16 checks this
Expand All @@ -71,7 +78,6 @@ fwrite <- function(x, file="", append=FALSE, quote="auto",
file <- enc2native(file) # CfwriteR cannot handle UTF-8 if that is not the native encoding, see #3078.
.Call(CfwriteR, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append,
row.names, col.names, logical01, dateTimeAs, buffMB, nThread,
showProgress, verbose)
showProgress, is_gzip, verbose)
invisible()
}

21 changes: 21 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -9349,6 +9349,27 @@ test(1658.34, fwrite(matrix(1:4, nrow=2, ncol=2), quote = TRUE), output = '"V1",
test(1658.35, fwrite(matrix(1:3, nrow=3, ncol=1), quote = TRUE), output = '"V1"\n.*1\n2\n3', message = "x being coerced from class: matrix to data.table")
test(1658.36, fwrite(matrix(1:4, nrow=2, ncol=2, dimnames = list(c("ra","rb"),c("ca","cb"))), quote = TRUE), output = '"ca","cb"\n.*1,3\n2,4', message = "x being coerced from class: matrix to data.table")

# fwrite output to the console ignores compress: file is left at its default here,
# so the expected output below is plain csv text even though compress="gzip" is requested.
test(1658.37, fwrite(data.table(a=c(1:3), b=c(1:3)), compress="gzip"),
output='a,b\n1,1\n2,2\n3,3')

# fwrite force gzipped output
if (.Platform$OS.type=="unix") {
f <- tempfile()
fwrite(data.table(a=c(1:3), b=c(1:3)), file=f, compress="gzip")
test(1658.38, system(paste("zcat", f), intern=T), output='[1] "a,b" "1,1" "2,2" "3,3"')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is a manual check that will make this line (and the next test too) fail, because it uses T instead of TRUE; see:

# No T or F symbols in tests.Rraw. 24 valid F (quoted, column name or in data) and 1 valid T at the time of writing

unlink(f)
}


# fwrite force csv output: with compress="none" the file must be plain csv.
# The file is read back with the `cat` command, so the test only runs on unix.
if (.Platform$OS.type=="unix") {
  f <- tempfile()
  fwrite(data.table(a=c(1:3), b=c(1:3)), file=f, compress="none")
  # intern=TRUE (not T): T is an ordinary variable that can be reassigned, whereas
  # TRUE is a reserved word; spelling it out also avoids a bare T symbol in this file.
  test(1658.39, system(paste("cat", f), intern=TRUE), output='[1] "a,b" "1,1" "2,2" "3,3"')
  unlink(f)
}

## End fwrite tests

# tests for #679, inrange(), FR #707
Expand Down
2 changes: 2 additions & 0 deletions man/fwrite.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto",
dateTimeAs = c("ISO","squash","epoch","write.csv"),
buffMB = 8L, nThread = getDTthreads(verbose),
showProgress = getOption("datatable.showProgress", interactive()),
compress = c("auto", "none", "gzip"),
verbose = getOption("datatable.verbose", FALSE))
}
\arguments{
Expand Down Expand Up @@ -52,6 +53,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto",
\item{buffMB}{The buffer size (MB) per thread in the range 1 to 1024, default 8MB. Experiment to see what works best for your data on your hardware.}
\item{nThread}{The number of threads to use. Experiment to see what works best for your data on your hardware.}
\item{showProgress}{ Display a progress meter on the console? Ignored when \code{file==""}. }
\item{compress}{If \code{compress = "auto"} (the default), the output is gzipped csv when \code{file} ends in \code{.gz} and plain csv otherwise. If \code{compress = "none"}, the output is always plain csv. If \code{compress = "gzip"}, the output is always gzipped csv. Output to the console is never gzipped, even if \code{compress = "gzip"}.}
\item{verbose}{Be chatty and report timings?}
}
\details{
Expand Down
82 changes: 65 additions & 17 deletions src/fwrite.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <math.h> // isfinite, isnan
#include <stdlib.h> // abs
#include <string.h> // strlen, strerror

#ifdef WIN32
#include <sys/types.h>
#include <sys/stat.h>
Expand All @@ -17,6 +18,8 @@
#define WRITE write
#define CLOSE close
#endif

#include "zlib.h" // for writing gzip file
#include "myomp.h"
#include "fwrite.h"

Expand Down Expand Up @@ -643,27 +646,45 @@ void fwriteMain(fwriteMainArgs args)
maxLineLen += eolLen;
if (args.verbose) DTPRINT("maxLineLen=%d from sample. Found in %.3fs\n", maxLineLen, 1.0*(wallclock()-t0));

int f;
int f=0;
gzFile zf=NULL;
int err;
if (*args.filename=='\0') {
f=-1; // file="" means write to standard output
args.is_gzip = false; // gzip is only for file
// eol = "\n"; // We'll use DTPRINT which converts \n to \r\n inside it on Windows
} else {
} else if (!args.is_gzip) {
#ifdef WIN32
f = _open(args.filename, _O_WRONLY | _O_BINARY | _O_CREAT | (args.append ? _O_APPEND : _O_TRUNC), _S_IWRITE);
// O_BINARY rather than O_TEXT for explicit control and speed since it seems that write() has a branch inside it
// to convert \n to \r\n on Windows when in text mode but not when in binary mode.
#else
f = open(args.filename, O_WRONLY | O_CREAT | (args.append ? O_APPEND : O_TRUNC), 0666);
// There is no binary/text mode distinction on Linux and Mac
#endif
if (f == -1) {
int erropen = errno;
STOP(access( args.filename, F_OK ) != -1 ?
"%s: '%s'. Failed to open existing file for writing. Do you have write permission to it? Is this Windows and does another process such as Excel have it open?" :
"%s: '%s'. Unable to create new file for writing (it does not exist already). Do you have permission to write here, is there space on the disk and does the path exist?",
strerror(erropen), args.filename);
}
} else {
#endif
zf = gzopen(args.filename, "wb");
if (zf == NULL) {
int erropen = errno;
STOP(access( args.filename, F_OK ) != -1 ?
"%s: '%s'. Failed to open existing file for writing. Do you have write permission to it? Is this Windows and does another process such as Excel have it open?" :
"%s: '%s'. Unable to create new file for writing (it does not exist already). Do you have permission to write here, is there space on the disk and does the path exist?",
strerror(erropen), args.filename);
}
// alloc gzip buffer : buff + 10% + 16
size_t buffzSize = (size_t)(1024*1024*buffMB + 1024*1024*buffMB / 10 + 16);
if (gzbuffer(zf, buffzSize)) {
STOP("Error allocate buffer for gzip file");
}
}

t0=wallclock();

if (args.verbose) {
Expand All @@ -683,32 +704,50 @@ void fwriteMain(fwriteMainArgs args)
}
for (int j=0; j<args.ncol; j++) {
writeString(args.colNames, j, &ch);
if (f==-1) {
*ch = '\0';
DTPRINT(buff);
} else if (WRITE(f, buff, (int)(ch-buff))==-1) { // TODO: move error check inside WRITE
int errwrite=errno; // capture write errno now incase close fails with a different errno
close(f);
free(buff);
STOP("%s: '%s'", strerror(errwrite), args.filename);
if(!args.is_gzip) {
if (f==-1) {
*ch = '\0';
DTPRINT(buff);
} else if (WRITE(f, buff, (int)(ch-buff)) == -1) { // TODO: move error check inside WRITE
int errwrite=errno; // capture write errno now incase close fails with a different errno
CLOSE(f);
free(buff);
STOP("%s: '%s'", strerror(errwrite), args.filename);
}
} else {
if ((!gzwrite(zf, buff, (int)(ch-buff)))) {
int errwrite=gzclose(zf);
free(buff);
STOP("Error gzwrite %d: %s", errwrite, args.filename);
}
}

ch = buff; // overwrite column names at the start in case they are > 1 million bytes long
*ch++ = args.sep; // this sep after the last column name won't be written to the file
}
if (f==-1) {
DTPRINT(args.eol);
} else if (WRITE(f, args.eol, eolLen)==-1) {
} else if (!args.is_gzip && WRITE(f, args.eol, eolLen)==-1) {
int errwrite=errno;
close(f);
CLOSE(f);
free(buff);
STOP("%s: '%s'", strerror(errwrite), args.filename);
} else if (args.is_gzip && (!gzwrite(zf, args.eol, eolLen))) {
int errwrite=gzclose(zf);
free(buff);
STOP("Error gzwrite %d: %s", errwrite, args.filename);
}

}
free(buff); // TODO: also to be free'd in cleanup when there's an error opening file above
if (args.verbose) DTPRINT("done in %.3fs\n", 1.0*(wallclock()-t0));
if (args.nrow == 0) {
if (args.verbose) DTPRINT("No data rows present (nrow==0)\n");
if (f!=-1 && CLOSE(f)) STOP("%s: '%s'", strerror(errno), args.filename);
if (args.is_gzip) {
if ( (err = gzclose(zf)) ) STOP("gzclose error %d: '%s'", err, args.filename);
} else {
if (f!=-1 && CLOSE(f)) STOP("%s: '%s'", strerror(errno), args.filename);
}
return;
}

Expand Down Expand Up @@ -815,8 +854,10 @@ void fwriteMain(fwriteMainArgs args)
// by slave threads, even when one-at-a-time. Anyway, made this single-threaded when output to console
// to be safe (setDTthreads(1) in fwrite.R) since output to console doesn't need to be fast.
} else {
if (WRITE(f, myBuff, (int)(ch-myBuff)) == -1) {
if (!args.is_gzip && WRITE(f, myBuff, (int)(ch-myBuff)) == -1) {
failed=errno;
} else if (args.is_gzip && (!gzwrite(zf, myBuff, (int)(ch-myBuff)))) {
gzerror(zf, &failed);
}
if (myAlloc > buffSize) anyBufferGrown = true;
int used = 100*((double)(ch-myBuff))/buffSize; // percentage of original buffMB
Expand Down Expand Up @@ -873,8 +914,15 @@ void fwriteMain(fwriteMainArgs args)
DTPRINT("\n");
}
}
if (f!=-1 && CLOSE(f) && !failed)
STOP("%s: '%s'", strerror(errno), args.filename);

if (!args.is_gzip) {
if (f!=-1 && CLOSE(f) && !failed)
STOP("%s: '%s'", strerror(errno), args.filename);
} else {
if ( (err=gzclose(zf)) ) {
STOP("gzclose error %d: '%s'", err, args.filename);
}
}
// quoted '%s' in case of trailing spaces in the filename
// If a write failed, the line above tries close() to clean up, but that might fail as well. So the
// '&& !failed' is to not report the error as just 'closing file' but the next line for more detail
Expand Down
19 changes: 1 addition & 18 deletions src/fwrite.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,10 @@ typedef struct fwriteMainArgs
// contains non-ASCII characters, it should be UTF-8 encoded (however fread
// will not validate the encoding).
const char *filename;

int ncol;

int64_t nrow;

// a vector of pointers to all-same-length column vectors
void **columns;

writer_fun_t *funs; // a vector of writer_fun_t function pointers

// length ncol vector containing which fun[] to use for each column
Expand All @@ -48,19 +44,12 @@ typedef struct fwriteMainArgs
uint8_t *whichFun;

void *colNames; // NULL means no header, otherwise ncol strings

bool doRowNames; // optional, likely false

void *rowNames; // if doRowNames is true and rowNames is not NULL then they're used, otherwise row numbers are output.

char sep;

char sep2;

char dec;

const char *eol;

const char *na;

// The quote character is always " (ascii 34) and cannot be changed since nobody on Earth uses a different quoting character, surely
Expand All @@ -69,19 +58,13 @@ typedef struct fwriteMainArgs
int8_t doQuote;

bool qmethodEscape; // true means escape quotes using backslash, else double-up double quotes.

bool squashDateTime;

bool append;

int buffMB; // [1-1024] default 8MB

int nth;

bool showProgress;

bool verbose;

bool is_gzip;
} fwriteMainArgs;

void fwriteMain(fwriteMainArgs args);
Expand Down
6 changes: 4 additions & 2 deletions src/fwriteR.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

#include <stdbool.h>
#include "data.table.h"
#include "fwrite.h"
Expand Down Expand Up @@ -128,10 +127,13 @@ SEXP fwriteR(
SEXP buffMB_Arg, // [1-1024] default 8MB
SEXP nThread_Arg,
SEXP showProgress_Arg,
SEXP verbose_Arg)
SEXP is_gzip_Arg,
SEXP verbose_Arg
)
{
if (!isNewList(DF)) error("fwrite must be passed an object of type list; e.g. data.frame, data.table");
fwriteMainArgs args;
args.is_gzip = LOGICAL(is_gzip_Arg)[0];
args.verbose = LOGICAL(verbose_Arg)[0];
args.filename = CHAR(STRING_ELT(filename_Arg, 0));
args.ncol = length(DF);
Expand Down