support missing values in measure.vars arg to melt (#4720)

Rdatatable · May 9, 2021 · ebc14ce · ebc14ce
1 parent ebc5bc3
commit ebc14ce
Show file tree

Hide file tree

Showing 5 changed files with 97 additions and 50 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -54,6 +54,8 @@
 
 7. `fwrite()` gains a new `datatable.fwrite.sep` option to change the default separator, still `","` by default. Thanks to Tony Fischetti for the PR. As is good practice in R in general, we usually resist new global options for the reason that a user changing the option for their own code can inadvertently change the behaviour of any package using `data.table` too. However, in this case, the global option affects file output rather than code behaviour. In fact, the very reason the user may wish to change the default separator is that they know a different separator is more appropriate for their data being passed to the package using `fwrite` but cannot otherwise change the `fwrite` call within that package.
 
+8. `melt()` now supports `NA` entries when specifying a list of `measure.vars`, which translate into runs of missing values in the output. Useful for melting wide data with some missing columns, [#4027](https://github.com/Rdatatable/data.table/issues/4027). Thanks to @vspinu for reporting, and @tdhock for implementing.
+
 ## BUG FIXES
 
 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries.

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -17424,3 +17424,14 @@ test(2180, DT[, a:=NULL], data.table(b=2, a=3))
 
 # as.data.table(table(NULL)) was error, #4179
 test(2181, as.data.table(table(NULL)), data.table(NULL))
+
+# some missing variables in melt, #4027
+DT.wide = data.table(a2=2, b1=1, b2=2)
+expected = data.table(variable=factor(1:2), a=c(NA,2), b=c(1,2))
+test(2182.1, melt(DT.wide, measure.vars=list(a=c(NA,1), b=2:3)), expected)
+test(2182.2, melt(DT.wide, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2"))), expected)
+DTid = data.table(DT.wide, id=1)
+exid = data.table(id=1, expected)
+test(2182.3, melt(DTid, measure.vars=list(a=c(NA,1), b=2:3), id.vars="id"), exid)
+test(2182.4, melt(DTid, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2")), id.vars="id"), exid)
+test(2182.5, melt(DT.wide, measure.vars=list(a=c(NA,1), b=2:3), na.rm=TRUE)[, .(a, b)], data.table(a=2, b=2))#not testing variable because it is not computed correctly, #4455
diff --git a/man/melt.data.table.Rd b/man/melt.data.table.Rd
@@ -64,7 +64,11 @@ effect.
 
 From version \code{1.9.6}, \code{melt} gains a feature with \code{measure.vars}
 accepting a list of \code{character} or \code{integer} vectors as well to melt
-into multiple columns in a single function call efficiently. The function
+into multiple columns in a single function call efficiently.
+If a vector in the list contains missing values, or is shorter than the
+max length of the list elements, then the output will include runs of
+missing values at the specified position, or at the end.
+The function
 \code{\link{patterns}} can be used to provide regular expression patterns. When
 used along with \code{melt}, if \code{cols} argument is not provided, the
 patterns will be matched against \code{names(data)}, for convenience.
@@ -134,6 +138,10 @@ melt(DT, id=1:2, measure=patterns("f_", "d_"), value.factor=TRUE, na.rm=TRUE)
 # return 'NA' for missing columns, 'na.rm=TRUE' ignored due to list column
 melt(DT, id=1:2, measure=patterns("l_", "c_"), na.rm=TRUE)
 
+# measure list with missing/short entries results in output with runs of NA
+DT.missing.cols <- DT[, .(d_1, d_2, c_1, f_2)]
+melt(DT.missing.cols, measure=list(d=1:2, c="c_1", f=c(NA, "f_2")))
+
 }
 \seealso{
   \code{\link{dcast}}, \url{https://cran.r-project.org/package=reshape}

diff --git a/src/chmatch.c b/src/chmatch.c
@@ -74,11 +74,14 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch
   }
   int nuniq=0;
   for (int i=0; i<tablelen; ++i) {
-    SEXP s = td[i];
+    const SEXP s = td[i];
     int tl = TRUELENGTH(s);
     if (tl>0) { savetl(s); tl=0; }
     if (tl==0) SET_TRUELENGTH(s, chmatchdup ? -(++nuniq) : -i-1); // first time seen this string in table
   }
+  // in future if we need NAs in x not to be matched to NAs in table ...
+  // if (!matchNAtoNA && TRUELENGTH(NA_STRING)<0)
+  //   SET_TRUELENGTH(NA_STRING, 0);
   if (chmatchdup) {
     // chmatchdup() is basically base::pmatch() but without the partial matching part. For example :
     //   chmatchdup(c("a", "a"), c("a", "a"))   # 1,2  - the second 'a' in 'x' has a 2nd match in 'table'
@@ -107,7 +110,7 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch
     for (int i=0; i<xlen; ++i) {
       int u = TRUELENGTH(xd[i]);
       if (u<0) {
-        int w = counts[-u-1]++;
+        const int w = counts[-u-1]++;
         if (map[w]) { ansd[i]=map[w]; continue; }
         SET_TRUELENGTH(xd[i],0); // w falls on ending 0 marker: dups used up; any more dups should return nomatch
         // we still need the 0-setting loop at the end of this function because often there will be some values in table that are not matched to at all.
@@ -122,7 +125,7 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch
     }
   } else {
     for (int i=0; i<xlen; i++) {
-      int m = TRUELENGTH(xd[i]);
+      const int m = TRUELENGTH(xd[i]);
       ansd[i] = (m<0) ? -m : nomatch;
     }
   }

diff --git a/src/fmelt.c b/src/fmelt.c
@@ -105,7 +105,7 @@ SEXP measurelist(SEXP measure, SEXP dtnames) {
     SEXP x = VECTOR_ELT(measure, i);
     switch(TYPEOF(x)) {
       case STRSXP  :
-        SET_VECTOR_ELT(ans, i, chmatch(x, dtnames, 0));
+        SET_VECTOR_ELT(ans, i, chmatch(x, dtnames, NA_INTEGER));
         break;
       case REALSXP :
         SET_VECTOR_ELT(ans, i, coerceVector(x, INTSXP));
@@ -138,19 +138,23 @@ static SEXP unlist_(SEXP xint) {
   return(ans);
 }
 
+bool invalid_measure(int i, int ncol) {
+  return (i<=0 && i!=NA_INTEGER) || i>ncol;
+}
+
 SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) {
-  int i, ncol=LENGTH(DT), targetcols=0, protecti=0, u=0, v=0;
+  int ncol=LENGTH(DT), targetcols=0, protecti=0, u=0, v=0;
   SEXP thiscol, idcols = R_NilValue, valuecols = R_NilValue, tmp, tmp2, booltmp, unqtmp, ans;
   SEXP dtnames = PROTECT(getAttrib(DT, R_NamesSymbol)); protecti++;
 
   if (isNull(id) && isNull(measure)) {
-    for (i=0; i<ncol; i++) {
+    for (int i=0; i<ncol; ++i) {
       thiscol = VECTOR_ELT(DT, i);
       if ((isInteger(thiscol) || isNumeric(thiscol) || isLogical(thiscol)) && !isFactor(thiscol)) targetcols++;
     }
     idcols = PROTECT(allocVector(INTSXP, ncol-targetcols)); protecti++;
     tmp = PROTECT(allocVector(INTSXP, targetcols)); protecti++;
-    for (i=0; i<ncol; i++) {
+    for (int i=0; i<ncol; ++i) {
       thiscol = VECTOR_ELT(DT, i);
       if ((isInteger(thiscol) || isNumeric(thiscol) || isLogical(thiscol)) && !isFactor(thiscol)) {
         INTEGER(tmp)[u++] = i+1;
@@ -168,15 +172,15 @@ SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) {
       default : error(_("Unknown 'id.vars' type %s, must be character or integer vector"), type2char(TYPEOF(id)));
     }
     booltmp = PROTECT(duplicated(tmp, FALSE)); protecti++;
-    for (i=0; i<length(tmp); i++) {
+    for (int i=0; i<length(tmp); ++i) {
       if (INTEGER(tmp)[i] <= 0 || INTEGER(tmp)[i] > ncol)
         error(_("One or more values in 'id.vars' is invalid."));
       else if (!LOGICAL(booltmp)[i]) targetcols++;
       else continue;
     }
     unqtmp = PROTECT(allocVector(INTSXP, targetcols)); protecti++;
     u = 0;
-    for (i=0; i<length(booltmp); i++) {
+    for (int i=0; i<length(booltmp); ++i) {
       if (!LOGICAL(booltmp)[i]) {
         INTEGER(unqtmp)[u++] = INTEGER(tmp)[i];
       }
@@ -202,15 +206,15 @@ SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) {
       tmp = PROTECT(unlist_(tmp2)); protecti++;
     }
     booltmp = PROTECT(duplicated(tmp, FALSE)); protecti++;
-    for (i=0; i<length(tmp); i++) {
-      if (INTEGER(tmp)[i] <= 0 || INTEGER(tmp)[i] > ncol)
+    for (int i=0; i<length(tmp); ++i) {
+      if (invalid_measure(INTEGER(tmp)[i], ncol))
         error(_("One or more values in 'measure.vars' is invalid."));
       else if (!LOGICAL(booltmp)[i]) targetcols++;
       else continue;
     }
     unqtmp = PROTECT(allocVector(INTSXP, targetcols)); protecti++;
     u = 0;
-    for (i=0; i<length(booltmp); i++) {
+    for (int i=0; i<length(booltmp); ++i) {
       if (!LOGICAL(booltmp)[i]) {
         INTEGER(unqtmp)[u++] = INTEGER(tmp)[i];
       }
@@ -232,7 +236,7 @@ SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) {
       case INTSXP  : tmp = id; break;
       default : error(_("Unknown 'id.vars' type %s, must be character or integer vector"), type2char(TYPEOF(id)));
     }
-    for (i=0; i<length(tmp); i++) {
+    for (int i=0; i<length(tmp); ++i) {
       if (INTEGER(tmp)[i] <= 0 || INTEGER(tmp)[i] > ncol)
         error(_("One or more values in 'id.vars' is invalid."));
     }
@@ -248,8 +252,8 @@ SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) {
     if (isNewList(measure)) {
       tmp = PROTECT(unlist_(tmp2)); protecti++;
     }
-    for (i=0; i<length(tmp); i++) {
-      if (INTEGER(tmp)[i] <= 0 || INTEGER(tmp)[i] > ncol)
+    for (int i=0; i<length(tmp); ++i) {
+      if (invalid_measure(INTEGER(tmp)[i], ncol)) 
         error(_("One or more values in 'measure.vars' is invalid."));
     }
     if (isNewList(measure)) valuecols = tmp2;
@@ -260,26 +264,33 @@ SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) {
   }
   ans = PROTECT(allocVector(VECSXP, 2)); protecti++;
   SET_VECTOR_ELT(ans, 0, idcols);
-  SET_VECTOR_ELT(ans, 1, valuecols);
+  SET_VECTOR_ELT(ans, 1, valuecols);//List of integer vectors.
   UNPROTECT(protecti);
   return(ans);
 }
 
 struct processData {
-  SEXP RCHK;  // a 2 item list holding vars (result of checkVars) and naidx. PROTECTed up in fmelt so that preprocess() doesn't need to PROTECT. To pass rchk, #2865
-  SEXP idcols, valuecols, naidx; // convenience pointers into RCHK[0][0], RCHK[0][1] and RCHK[1] respectively
-  int lids, lvalues, lmax, lmin, totlen, nrow;
-  int *isfactor, *leach, *isidentical;
+  SEXP RCHK;      // a 2 item list holding vars (result of checkVars) and naidx. PROTECTed up in fmelt so that preprocess() doesn't need to PROTECT. To pass rchk, #2865
+  SEXP idcols,    // convenience pointers into RCHK[0][0], RCHK[0][1] and RCHK[1] respectively
+    valuecols,    // list with one element per output/value column, each element is an integer vector.
+    naidx; 
+  int *isfactor,
+    *leach,       // length of each element of the valuecols(measure.vars) list.
+    *isidentical; // are all inputs for this value column the same type?
+  int lids,       // number of id columns.
+    lvalues,      // number of value columns.
+    lmax,         // max length of valuecols elements / number of times to repeat ids.
+    totlen,       // of output/long DT result of melt operation.
+    nrow;         // of input/wide DT to be melted.
   SEXPTYPE *maxtype;
-  Rboolean narm;
+  Rboolean narm;  // remove missing values?
 };
 
 static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valnames, Rboolean narm, Rboolean verbose, struct processData *data) {
 
   SEXP vars,tmp,thiscol;
   SEXPTYPE type;
-  int i,j;
-  data->lmax = 0; data->lmin = 0; data->totlen = 0; data->nrow = length(VECTOR_ELT(DT, 0));
+  data->lmax = 0; data->totlen = 0; data->nrow = length(VECTOR_ELT(DT, 0));
   SET_VECTOR_ELT(data->RCHK, 0, vars = checkVars(DT, id, measure, verbose));
   data->idcols = VECTOR_ELT(vars, 0);
   data->valuecols = VECTOR_ELT(vars, 1);
@@ -296,29 +307,36 @@ static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valna
   data->isidentical = (int *)R_alloc(data->lvalues, sizeof(int));
   data->isfactor = (int *)R_alloc(data->lvalues, sizeof(int));
   data->maxtype = (SEXPTYPE *)R_alloc(data->lvalues, sizeof(SEXPTYPE));
-  for (i=0; i<data->lvalues; i++) {
+  // first find max type of each output column.
+  for (int i=0; i<data->lvalues; ++i) { // for each output column.
     tmp = VECTOR_ELT(data->valuecols, i);
     data->leach[i] = length(tmp);
     data->isidentical[i] = 1;  // TODO - why 1 and not Rboolean TRUE?
     data->isfactor[i] = 0;  // seems to hold 2 below, so not an Rboolean FALSE here. TODO - better name for variable?
     data->maxtype[i] = 0;   // R_alloc doesn't initialize so careful to here, relied on below
     data->lmax = (data->lmax > data->leach[i]) ? data->lmax : data->leach[i];
-    data->lmin = (data->lmin < data->leach[i]) ? data->lmin : data->leach[i];
-    for (j=0; j<data->leach[i]; j++) {
-      thiscol = VECTOR_ELT(DT, INTEGER(tmp)[j]-1);
-      if (isFactor(thiscol)) {
-        data->isfactor[i] = (isOrdered(thiscol)) ? 2 : 1;
-        data->maxtype[i]  = STRSXP;
-      } else {
-        type = TYPEOF(thiscol);
-        if (type > data->maxtype[i]) data->maxtype[i] = type;
+    for (int j=0; j<data->leach[i]; ++j) { // for each input column.
+      int this_col_num = INTEGER(tmp)[j];
+      if(this_col_num != NA_INTEGER){
+        thiscol = VECTOR_ELT(DT, this_col_num-1);
+        if (isFactor(thiscol)) {
+          data->isfactor[i] = (isOrdered(thiscol)) ? 2 : 1;
+          data->maxtype[i]  = STRSXP;
+        } else {
+          type = TYPEOF(thiscol);
+          if (type > data->maxtype[i]) data->maxtype[i] = type;
+        }
       }
     }
-    for (j=0; j<data->leach[i]; j++) {
-      thiscol = VECTOR_ELT(DT, INTEGER(tmp)[j]-1);
-      if ( (!isFactor(thiscol) && data->maxtype[i] != TYPEOF(thiscol)) || (isFactor(thiscol) && data->maxtype[i] != STRSXP) ) {
-        data->isidentical[i] = 0;
-        break;
+    for (int j=0; j<data->leach[i]; ++j) {
+      int this_col_num = INTEGER(tmp)[j];
+      if(this_col_num != NA_INTEGER){
+        thiscol = VECTOR_ELT(DT, this_col_num-1);
+        if ( (!isFactor(thiscol) && data->maxtype[i] != TYPEOF(thiscol)) ||
+             (isFactor(thiscol) && data->maxtype[i] != STRSXP) ) {
+          data->isidentical[i] = 0;
+          break;
+        }
       }
     }
   }
@@ -392,6 +410,16 @@ static SEXP combineFactorLevels(SEXP factorLevels, SEXP target, int * factorType
   return ans;
 }
 
+SEXP input_col_or_na(SEXP DT, struct processData* data, SEXP thisvaluecols, int out_col, int in_col) {
+  if (in_col < data->leach[out_col]) {
+    int input_column_num = INTEGER(thisvaluecols)[in_col];
+    if (input_column_num != NA_INTEGER) {
+      return VECTOR_ELT(DT, input_column_num-1);
+    }
+  }
+  return allocNAVector(data->maxtype[out_col], data->nrow);
+}  
+
 SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, struct processData *data) {
   for (int i=0; i<data->lvalues; ++i) {
     SEXP thisvaluecols = VECTOR_ELT(data->valuecols, i);
@@ -407,12 +435,8 @@ SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, s
     for (int i=0; i<data->lmax; ++i) {
       SEXP tmp = PROTECT(allocVector(VECSXP, data->lvalues));
       for (int j=0; j<data->lvalues; ++j) {
-        if (i < data->leach[j]) {
-          SEXP thisvaluecols = VECTOR_ELT(data->valuecols, j);
-          SET_VECTOR_ELT(tmp, j, VECTOR_ELT(DT, INTEGER(thisvaluecols)[i]-1));
-        } else {
-          SET_VECTOR_ELT(tmp, j, allocNAVector(data->maxtype[j], data->nrow));
-        }
+        SEXP thisvaluecols = VECTOR_ELT(data->valuecols, j);
+        SET_VECTOR_ELT(tmp, j, input_col_or_na(DT, data, thisvaluecols, j, i));
       }
       tmp = PROTECT(dt_na(tmp, seqcols));
       SEXP w;
@@ -427,18 +451,17 @@ SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, s
   SEXP flevels = PROTECT(allocVector(VECSXP, data->lmax));
   Rboolean *isordered = (Rboolean *)R_alloc(data->lmax, sizeof(Rboolean));
   SEXP ansvals = PROTECT(allocVector(VECSXP, data->lvalues));
-  for (int i=0; i<data->lvalues; ++i) {
+  for (int i=0; i<data->lvalues; ++i) {//for each output/value column.
     bool thisvalfactor = (data->maxtype[i] == VECSXP) ? false : valfactor;
     SEXP target = PROTECT(allocVector(data->maxtype[i], data->totlen)); // to keep rchk happy
     SET_VECTOR_ELT(ansvals, i, target);
     UNPROTECT(1);  // still protected by virtue of being member of protected ansval.
-    SEXP thisvaluecols = VECTOR_ELT(data->valuecols, i);
+    SEXP thisvaluecols = VECTOR_ELT(data->valuecols, i); // integer vector of column ids.
     int counter = 0;
     bool copyattr = false;
-    for (int j=0; j<data->lmax; ++j) {
+    for (int j=0; j<data->lmax; ++j) {// for each input column.
       int thisprotecti = 0;
-      SEXP thiscol = (j < data->leach[i]) ? VECTOR_ELT(DT, INTEGER(thisvaluecols)[j]-1)
-                       : allocNAVector(data->maxtype[i], data->nrow);
+      SEXP thiscol = input_col_or_na(DT, data, thisvaluecols, i, j);
       if (!copyattr && data->isidentical[i] && !data->isfactor[i]) {
         copyMostAttrib(thiscol, target);
         copyattr = true;