Rdatatable · mattdowle · May 9, 2021 · Sep 26, 2020 · Sep 26, 2020 · Sep 26, 2020
@@ -6,6 +6,13 @@
 
 ## NEW FEATURES
 
+1. `melt.data.table()` now supports `NA` entries when specifying a
+   list of `measure.vars`, which translate into runs of missing values
+   in the output. Fixes
+   [#4027](https://github.com/Rdatatable/data.table/issues/4027) via
+   [PR#4720](https://github.com/Rdatatable/data.table/pull/4720) from
+   @tdhock.
+
 ## BUG FIXES
 
 1. `test.data.table()` could fail the 2nd time it is run by a user in the same R session on Windows due to not resetting locale properly after testing Chinese translation, [#4630](https://github.com/Rdatatable/data.table/pull/4630). Thanks to Cole Miller for investigating and fixing.

@@ -17138,3 +17138,12 @@ test(2153.4, address(ans$V1[[1L]]), address(ans$V1[[2L]]))  # .NGRP doesn't chan
 test(2153.5, DT[, .(list(c(0L,.N,0L))), by=x],  # c() here will create new object so this is ok anyway; i.e. address(.N) is not present in j's result
              data.table(x=1:2, V1=list(c(0L,1L,0L), c(0L,2L,0L)))) 
 
+# fix for #4027 via PR#4720.
+DT.wide = data.table(a2=2, b1=1, b2=2)
+expected = data.table(variable=factor(1:2), a=c(NA,2), b=c(1,2))
+test(2154.1, melt(DT.wide, measure.vars=list(a=c(NA,1), b=2:3)), expected)
+test(2154.2, melt(DT.wide, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2"))), expected)
+DTid <- data.table(DT.wide, id=1)
+exid <- data.table(id=1, expected)
+test(2154.3, melt(DTid, measure.vars=list(a=c(NA,1), b=2:3), id.vars="id"), exid)
+test(2154.4, melt(DTid, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2")), id.vars="id"), exid)
@@ -97,6 +97,21 @@ static const char *concat(SEXP vec, SEXP idx) {
   return ans;
 }
 
+// input: character vector of column names (maybe missing), output:
+// integer vector of column indices with NA_INTEGER in the positions
+// with missing inputs.
+SEXP chmatch_na(SEXP x, SEXP table){
+  SEXP ans;
+  PROTECT(ans = chmatch(x, table, 0));
+  for(int i=0; i<length(ans); i++){
+    if(STRING_ELT(x, i) == NA_STRING){
+      INTEGER(ans)[i] = NA_INTEGER;
+    }
+  }
+  UNPROTECT(1);
+  return ans;
+}
+
 // deal with measure.vars of type VECSXP
 SEXP measurelist(SEXP measure, SEXP dtnames) {
   const int n=length(measure);
@@ -105,7 +120,7 @@ SEXP measurelist(SEXP measure, SEXP dtnames) {
     SEXP x = VECTOR_ELT(measure, i);
     switch(TYPEOF(x)) {
       case STRSXP  :
-        SET_VECTOR_ELT(ans, i, chmatch(x, dtnames, 0));
+        SET_VECTOR_ELT(ans, i, chmatch_na(x, dtnames));
         break;
       case REALSXP :
         SET_VECTOR_ELT(ans, i, coerceVector(x, INTSXP));
@@ -138,6 +153,10 @@ static SEXP unlist_(SEXP xint) {
   return(ans);
 }
 
+bool invalid_measure(int i, int ncol) {
+  return i != NA_INTEGER && (i <= 0 || i > ncol);
+}
+
 SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) {
   int i, ncol=LENGTH(DT), targetcols=0, protecti=0, u=0, v=0;
   SEXP thiscol, idcols = R_NilValue, valuecols = R_NilValue, tmp, tmp2, booltmp, unqtmp, ans;
@@ -203,7 +222,7 @@ SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) {
     }
     booltmp = PROTECT(duplicated(tmp, FALSE)); protecti++;
     for (i=0; i<length(tmp); i++) {
-      if (INTEGER(tmp)[i] <= 0 || INTEGER(tmp)[i] > ncol)
+      if (invalid_measure(INTEGER(tmp)[i], ncol))
         error(_("One or more values in 'measure.vars' is invalid."));
       else if (!LOGICAL(booltmp)[i]) targetcols++;
       else continue;
@@ -249,7 +268,7 @@ SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) {
       tmp = PROTECT(unlist_(tmp2)); protecti++;
     }
     for (i=0; i<length(tmp); i++) {
-      if (INTEGER(tmp)[i] <= 0 || INTEGER(tmp)[i] > ncol)
+      if (invalid_measure(INTEGER(tmp)[i], ncol)) 
         error(_("One or more values in 'measure.vars' is invalid."));
     }
     if (isNewList(measure)) valuecols = tmp2;
@@ -260,26 +279,35 @@ SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) {
   }
   ans = PROTECT(allocVector(VECSXP, 2)); protecti++;
   SET_VECTOR_ELT(ans, 0, idcols);
-  SET_VECTOR_ELT(ans, 1, valuecols);
+  SET_VECTOR_ELT(ans, 1, valuecols);//List of integer vectors.
   UNPROTECT(protecti);
   return(ans);
 }
 
 struct processData {
   SEXP RCHK;  // a 2 item list holding vars (result of checkVars) and naidx. PROTECTed up in fmelt so that preprocess() doesn't need to PROTECT. To pass rchk, #2865
-  SEXP idcols, valuecols, naidx; // convenience pointers into RCHK[0][0], RCHK[0][1] and RCHK[1] respectively
-  int lids, lvalues, lmax, lmin, totlen, nrow;
-  int *isfactor, *leach, *isidentical;
+  SEXP idcols, 
+    valuecols, // list with one element per output/value column, each
+	       // element is an integer vector.
+    naidx; // convenience pointers into RCHK[0][0], RCHK[0][1] and RCHK[1] respectively
+  int *isfactor,
+    *leach, // length of each element of the valuecols(measure.vars) list.
+    *isidentical; // are all inputs for this value column the same type?
+  int lids, // number of id columns.
+    lvalues, // number of value columns.
+    lmax,  //max length of valuecols elements / number of times to repeat ids.
+    totlen, // of output/long DT result of melt operation.
+    nrow; // of input/wide DT to be melted.
   SEXPTYPE *maxtype;
-  Rboolean narm;
+  Rboolean narm; // remove missing values?
 };
 
 static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valnames, Rboolean narm, Rboolean verbose, struct processData *data) {
 
   SEXP vars,tmp,thiscol;
   SEXPTYPE type;
   int i,j;
-  data->lmax = 0; data->lmin = 0; data->totlen = 0; data->nrow = length(VECTOR_ELT(DT, 0));
+  data->lmax = 0; data->totlen = 0; data->nrow = length(VECTOR_ELT(DT, 0));
   SET_VECTOR_ELT(data->RCHK, 0, vars = checkVars(DT, id, measure, verbose));
   data->idcols = VECTOR_ELT(vars, 0);
   data->valuecols = VECTOR_ELT(vars, 1);
@@ -296,29 +324,36 @@ static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valna
   data->isidentical = (int *)R_alloc(data->lvalues, sizeof(int));
   data->isfactor = (int *)R_alloc(data->lvalues, sizeof(int));
   data->maxtype = (SEXPTYPE *)R_alloc(data->lvalues, sizeof(SEXPTYPE));
-  for (i=0; i<data->lvalues; i++) {
+  // first find max type of each output column.
+  for (i=0; i<data->lvalues; i++) { // for each output column.
     tmp = VECTOR_ELT(data->valuecols, i);
     data->leach[i] = length(tmp);
     data->isidentical[i] = 1;  // TODO - why 1 and not Rboolean TRUE?
     data->isfactor[i] = 0;  // seems to hold 2 below, so not an Rboolean FALSE here. TODO - better name for variable?
     data->maxtype[i] = 0;   // R_alloc doesn't initialize so careful to here, relied on below
     data->lmax = (data->lmax > data->leach[i]) ? data->lmax : data->leach[i];
-    data->lmin = (data->lmin < data->leach[i]) ? data->lmin : data->leach[i];
-    for (j=0; j<data->leach[i]; j++) {
-      thiscol = VECTOR_ELT(DT, INTEGER(tmp)[j]-1);
-      if (isFactor(thiscol)) {
-        data->isfactor[i] = (isOrdered(thiscol)) ? 2 : 1;
-        data->maxtype[i]  = STRSXP;
-      } else {
-        type = TYPEOF(thiscol);
-        if (type > data->maxtype[i]) data->maxtype[i] = type;
+    for (j=0; j<data->leach[i]; j++) { // for each input column.
+      int this_col_num = INTEGER(tmp)[j];
+      if(this_col_num != NA_INTEGER){
+	thiscol = VECTOR_ELT(DT, this_col_num-1);
+	if (isFactor(thiscol)) {
+	  data->isfactor[i] = (isOrdered(thiscol)) ? 2 : 1;
+	  data->maxtype[i]  = STRSXP;
+	} else {
+	  type = TYPEOF(thiscol);
+	  if (type > data->maxtype[i]) data->maxtype[i] = type;
+	}
       }
     }
     for (j=0; j<data->leach[i]; j++) {
-      thiscol = VECTOR_ELT(DT, INTEGER(tmp)[j]-1);
-      if ( (!isFactor(thiscol) && data->maxtype[i] != TYPEOF(thiscol)) || (isFactor(thiscol) && data->maxtype[i] != STRSXP) ) {
-        data->isidentical[i] = 0;
-        break;
+      int this_col_num = INTEGER(tmp)[j];
+      if(this_col_num != NA_INTEGER){
+	thiscol = VECTOR_ELT(DT, this_col_num-1);
+	if ( (!isFactor(thiscol) && data->maxtype[i] != TYPEOF(thiscol)) ||
+	     (isFactor(thiscol) && data->maxtype[i] != STRSXP) ) {
+	  data->isidentical[i] = 0;
+	  break;
+	}
       }
     }
   }
@@ -427,18 +462,25 @@ SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, s
   SEXP flevels = PROTECT(allocVector(VECSXP, data->lmax));
   Rboolean *isordered = (Rboolean *)R_alloc(data->lmax, sizeof(Rboolean));
   SEXP ansvals = PROTECT(allocVector(VECSXP, data->lvalues));
-  for (int i=0; i<data->lvalues; ++i) {
+  for (int i=0; i<data->lvalues; ++i) {//for each output/value column.
     bool thisvalfactor = (data->maxtype[i] == VECSXP) ? false : valfactor;
     SEXP target = PROTECT(allocVector(data->maxtype[i], data->totlen)); // to keep rchk happy
     SET_VECTOR_ELT(ansvals, i, target);
     UNPROTECT(1);  // still protected by virtue of being member of protected ansval.
-    SEXP thisvaluecols = VECTOR_ELT(data->valuecols, i);
+    SEXP thisvaluecols = VECTOR_ELT(data->valuecols, i); // integer vector of column ids.
     int counter = 0;
     bool copyattr = false;
-    for (int j=0; j<data->lmax; ++j) {
+    for (int j=0; j<data->lmax; ++j) {// for each input column.
       int thisprotecti = 0;
-      SEXP thiscol = (j < data->leach[i]) ? VECTOR_ELT(DT, INTEGER(thisvaluecols)[j]-1)
-                       : allocNAVector(data->maxtype[i], data->nrow);
+      // TODO use this line of code if NA specified.
+      SEXP thiscol;
+      int input_column_num = INTEGER(thisvaluecols)[j];
+      if (j >= data->leach[i] || // fewer indices than the max were specified.
+	  input_column_num == NA_INTEGER) { // NA was specified.
+	thiscol = allocNAVector(data->maxtype[i], data->nrow);
+      }else{
+	thiscol = VECTOR_ELT(DT, input_column_num-1);
+      }
       if (!copyattr && data->isidentical[i] && !data->isfactor[i]) {
         copyMostAttrib(thiscol, target);
         copyattr = true;