diff --git a/NEWS.md b/NEWS.md index 398819815..11250c6c9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -54,6 +54,8 @@ 7. `fwrite()` gains a new `datatable.fwrite.sep` option to change the default separator, still `","` by default. Thanks to Tony Fischetti for the PR. As is good practice in R in general, we usually resist new global options for the reason that a user changing the option for their own code can inadvertently change the behaviour of any package using `data.table` too. However, in this case, the global option affects file output rather than code behaviour. In fact, the very reason the user may wish to change the default separator is that they know a different separator is more appropriate for their data being passed to the package using `fwrite` but cannot otherwise change the `fwrite` call within that package. +8. `melt()` now supports `NA` entries when specifying a list of `measure.vars`, which translate into runs of missing values in the output. Useful for melting wide data with some missing columns, [#4027](https://github.com/Rdatatable/data.table/issues/4027). Thanks to @vspinu for reporting, and @tdhock for implementing. + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9f3bb5eb3..e13fea88c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17424,3 +17424,14 @@ test(2180, DT[, a:=NULL], data.table(b=2, a=3)) # as.data.table(table(NULL)) was error, #4179 test(2181, as.data.table(table(NULL)), data.table(NULL)) + +# some missing variables in melt, #4027 +DT.wide = data.table(a2=2, b1=1, b2=2) +expected = data.table(variable=factor(1:2), a=c(NA,2), b=c(1,2)) +test(2182.1, melt(DT.wide, measure.vars=list(a=c(NA,1), b=2:3)), expected) +test(2182.2, melt(DT.wide, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2"))), expected) +DTid = data.table(DT.wide, id=1) +exid = data.table(id=1, expected) +test(2182.3, melt(DTid, measure.vars=list(a=c(NA,1), b=2:3), id.vars="id"), exid) +test(2182.4, melt(DTid, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2")), id.vars="id"), exid) +test(2182.5, melt(DT.wide, measure.vars=list(a=c(NA,1), b=2:3), na.rm=TRUE)[, .(a, b)], data.table(a=2, b=2))#not testing variable because it is not computed correctly, #4455 diff --git a/man/melt.data.table.Rd b/man/melt.data.table.Rd index e51c61aaa..5ff25005d 100644 --- a/man/melt.data.table.Rd +++ b/man/melt.data.table.Rd @@ -64,7 +64,11 @@ effect. From version \code{1.9.6}, \code{melt} gains a feature with \code{measure.vars} accepting a list of \code{character} or \code{integer} vectors as well to melt -into multiple columns in a single function call efficiently. The function +into multiple columns in a single function call efficiently. +If a vector in the list contains missing values, or is shorter than the +max length of the list elements, then the output will include runs of +missing values at the specified position, or at the end. +The function \code{\link{patterns}} can be used to provide regular expression patterns. When used along with \code{melt}, if \code{cols} argument is not provided, the patterns will be matched against \code{names(data)}, for convenience. @@ -134,6 +138,10 @@ melt(DT, id=1:2, measure=patterns("f_", "d_"), value.factor=TRUE, na.rm=TRUE) # return 'NA' for missing columns, 'na.rm=TRUE' ignored due to list column melt(DT, id=1:2, measure=patterns("l_", "c_"), na.rm=TRUE) +# measure list with missing/short entries results in output with runs of NA +DT.missing.cols <- DT[, .(d_1, d_2, c_1, f_2)] +melt(DT.missing.cols, measure=list(d=1:2, c="c_1", f=c(NA, "f_2"))) + } \seealso{ \code{\link{dcast}}, \url{https://cran.r-project.org/package=reshape} diff --git a/src/chmatch.c b/src/chmatch.c index f80e7dd2c..a091e646f 100644 --- a/src/chmatch.c +++ b/src/chmatch.c @@ -74,11 +74,14 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch } int nuniq=0; for (int i=0; i0) { savetl(s); tl=0; } if (tl==0) SET_TRUELENGTH(s, chmatchdup ? -(++nuniq) : -i-1); // first time seen this string in table } + // in future if we need NAs in x not to be matched to NAs in table ... + // if (!matchNAtoNA && TRUELENGTH(NA_STRING)<0) + // SET_TRUELENGTH(NA_STRING, 0); if (chmatchdup) { // chmatchdup() is basically base::pmatch() but without the partial matching part. For example : // chmatchdup(c("a", "a"), c("a", "a")) # 1,2 - the second 'a' in 'x' has a 2nd match in 'table' @@ -107,7 +110,7 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch for (int i=0; incol; +} + SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) { - int i, ncol=LENGTH(DT), targetcols=0, protecti=0, u=0, v=0; + int ncol=LENGTH(DT), targetcols=0, protecti=0, u=0, v=0; SEXP thiscol, idcols = R_NilValue, valuecols = R_NilValue, tmp, tmp2, booltmp, unqtmp, ans; SEXP dtnames = PROTECT(getAttrib(DT, R_NamesSymbol)); protecti++; if (isNull(id) && isNull(measure)) { - for (i=0; i ncol) error(_("One or more values in 'id.vars' is invalid.")); else if (!LOGICAL(booltmp)[i]) targetcols++; @@ -176,7 +180,7 @@ SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) { } unqtmp = PROTECT(allocVector(INTSXP, targetcols)); protecti++; u = 0; - for (i=0; i ncol) + for (int i=0; i ncol) error(_("One or more values in 'id.vars' is invalid.")); } @@ -248,8 +252,8 @@ SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) { if (isNewList(measure)) { tmp = PROTECT(unlist_(tmp2)); protecti++; } - for (i=0; i ncol) + for (int i=0; ilmax = 0; data->lmin = 0; data->totlen = 0; data->nrow = length(VECTOR_ELT(DT, 0)); + data->lmax = 0; data->totlen = 0; data->nrow = length(VECTOR_ELT(DT, 0)); SET_VECTOR_ELT(data->RCHK, 0, vars = checkVars(DT, id, measure, verbose)); data->idcols = VECTOR_ELT(vars, 0); data->valuecols = VECTOR_ELT(vars, 1); @@ -296,29 +307,36 @@ static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valna data->isidentical = (int *)R_alloc(data->lvalues, sizeof(int)); data->isfactor = (int *)R_alloc(data->lvalues, sizeof(int)); data->maxtype = (SEXPTYPE *)R_alloc(data->lvalues, sizeof(SEXPTYPE)); - for (i=0; ilvalues; i++) { + // first find max type of each output column. + for (int i=0; ilvalues; ++i) { // for each output column. tmp = VECTOR_ELT(data->valuecols, i); data->leach[i] = length(tmp); data->isidentical[i] = 1; // TODO - why 1 and not Rboolean TRUE? data->isfactor[i] = 0; // seems to hold 2 below, so not an Rboolean FALSE here. TODO - better name for variable? data->maxtype[i] = 0; // R_alloc doesn't initialize so careful to here, relied on below data->lmax = (data->lmax > data->leach[i]) ? data->lmax : data->leach[i]; - data->lmin = (data->lmin < data->leach[i]) ? data->lmin : data->leach[i]; - for (j=0; jleach[i]; j++) { - thiscol = VECTOR_ELT(DT, INTEGER(tmp)[j]-1); - if (isFactor(thiscol)) { - data->isfactor[i] = (isOrdered(thiscol)) ? 2 : 1; - data->maxtype[i] = STRSXP; - } else { - type = TYPEOF(thiscol); - if (type > data->maxtype[i]) data->maxtype[i] = type; + for (int j=0; jleach[i]; ++j) { // for each input column. + int this_col_num = INTEGER(tmp)[j]; + if(this_col_num != NA_INTEGER){ + thiscol = VECTOR_ELT(DT, this_col_num-1); + if (isFactor(thiscol)) { + data->isfactor[i] = (isOrdered(thiscol)) ? 2 : 1; + data->maxtype[i] = STRSXP; + } else { + type = TYPEOF(thiscol); + if (type > data->maxtype[i]) data->maxtype[i] = type; + } } } - for (j=0; jleach[i]; j++) { - thiscol = VECTOR_ELT(DT, INTEGER(tmp)[j]-1); - if ( (!isFactor(thiscol) && data->maxtype[i] != TYPEOF(thiscol)) || (isFactor(thiscol) && data->maxtype[i] != STRSXP) ) { - data->isidentical[i] = 0; - break; + for (int j=0; jleach[i]; ++j) { + int this_col_num = INTEGER(tmp)[j]; + if(this_col_num != NA_INTEGER){ + thiscol = VECTOR_ELT(DT, this_col_num-1); + if ( (!isFactor(thiscol) && data->maxtype[i] != TYPEOF(thiscol)) || + (isFactor(thiscol) && data->maxtype[i] != STRSXP) ) { + data->isidentical[i] = 0; + break; + } } } } @@ -392,6 +410,16 @@ static SEXP combineFactorLevels(SEXP factorLevels, SEXP target, int * factorType return ans; } +SEXP input_col_or_na(SEXP DT, struct processData* data, SEXP thisvaluecols, int out_col, int in_col) { + if (in_col < data->leach[out_col]) { + int input_column_num = INTEGER(thisvaluecols)[in_col]; + if (input_column_num != NA_INTEGER) { + return VECTOR_ELT(DT, input_column_num-1); + } + } + return allocNAVector(data->maxtype[out_col], data->nrow); +} + SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, struct processData *data) { for (int i=0; ilvalues; ++i) { SEXP thisvaluecols = VECTOR_ELT(data->valuecols, i); @@ -407,12 +435,8 @@ SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, s for (int i=0; ilmax; ++i) { SEXP tmp = PROTECT(allocVector(VECSXP, data->lvalues)); for (int j=0; jlvalues; ++j) { - if (i < data->leach[j]) { - SEXP thisvaluecols = VECTOR_ELT(data->valuecols, j); - SET_VECTOR_ELT(tmp, j, VECTOR_ELT(DT, INTEGER(thisvaluecols)[i]-1)); - } else { - SET_VECTOR_ELT(tmp, j, allocNAVector(data->maxtype[j], data->nrow)); - } + SEXP thisvaluecols = VECTOR_ELT(data->valuecols, j); + SET_VECTOR_ELT(tmp, j, input_col_or_na(DT, data, thisvaluecols, j, i)); } tmp = PROTECT(dt_na(tmp, seqcols)); SEXP w; @@ -427,18 +451,17 @@ SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, s SEXP flevels = PROTECT(allocVector(VECSXP, data->lmax)); Rboolean *isordered = (Rboolean *)R_alloc(data->lmax, sizeof(Rboolean)); SEXP ansvals = PROTECT(allocVector(VECSXP, data->lvalues)); - for (int i=0; ilvalues; ++i) { + for (int i=0; ilvalues; ++i) {//for each output/value column. bool thisvalfactor = (data->maxtype[i] == VECSXP) ? false : valfactor; SEXP target = PROTECT(allocVector(data->maxtype[i], data->totlen)); // to keep rchk happy SET_VECTOR_ELT(ansvals, i, target); UNPROTECT(1); // still protected by virtue of being member of protected ansval. - SEXP thisvaluecols = VECTOR_ELT(data->valuecols, i); + SEXP thisvaluecols = VECTOR_ELT(data->valuecols, i); // integer vector of column ids. int counter = 0; bool copyattr = false; - for (int j=0; jlmax; ++j) { + for (int j=0; jlmax; ++j) {// for each input column. int thisprotecti = 0; - SEXP thiscol = (j < data->leach[i]) ? VECTOR_ELT(DT, INTEGER(thisvaluecols)[j]-1) - : allocNAVector(data->maxtype[i], data->nrow); + SEXP thiscol = input_col_or_na(DT, data, thisvaluecols, i, j); if (!copyattr && data->isidentical[i] && !data->isfactor[i]) { copyMostAttrib(thiscol, target); copyattr = true;