Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support missing values in measure.vars arg to melt #4720

Merged
merged 15 commits into from
May 9, 2021
7 changes: 7 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,13 @@

## NEW FEATURES

1. `melt.data.table()` now supports `NA` entries when specifying a
list of `measure.vars`, which translate into runs of missing values
tdhock marked this conversation as resolved.
Show resolved Hide resolved
in the output. Fixes
[#4027](https://github.com/Rdatatable/data.table/issues/4027) via
[PR#4720](https://github.com/Rdatatable/data.table/pull/4720) from
@tdhock.

## BUG FIXES

1. `test.data.table()` could fail the 2nd time it is run by a user in the same R session on Windows due to not resetting locale properly after testing Chinese translation, [#4630](https://github.com/Rdatatable/data.table/pull/4630). Thanks to Cole Miller for investigating and fixing.
Expand Down
9 changes: 9 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -17138,3 +17138,12 @@ test(2153.4, address(ans$V1[[1L]]), address(ans$V1[[2L]])) # .NGRP doesn't chan
test(2153.5, DT[, .(list(c(0L,.N,0L))), by=x], # c() here will create new object so this is ok anyway; i.e. address(.N) is not present in j's result
data.table(x=1:2, V1=list(c(0L,1L,0L), c(0L,2L,0L))))

# fix for #4027 via PR#4720.
DT.wide = data.table(a2=2, b1=1, b2=2)
expected = data.table(variable=factor(1:2), a=c(NA,2), b=c(1,2))
test(2154.1, melt(DT.wide, measure.vars=list(a=c(NA,1), b=2:3)), expected)
test(2154.2, melt(DT.wide, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2"))), expected)
DTid <- data.table(DT.wide, id=1)
exid <- data.table(id=1, expected)
test(2154.3, melt(DTid, measure.vars=list(a=c(NA,1), b=2:3), id.vars="id"), exid)
test(2154.4, melt(DTid, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2")), id.vars="id"), exid)
98 changes: 70 additions & 28 deletions src/fmelt.c
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,21 @@ static const char *concat(SEXP vec, SEXP idx) {
return ans;
}

// input: character vector of column names (maybe missing), output:
// integer vector of column indices with NA_INTEGER in the positions
// with missing inputs.
SEXP chmatch_na(SEXP x, SEXP table){
SEXP ans;
PROTECT(ans = chmatch(x, table, 0));
for(int i=0; i<length(ans); i++){
if(STRING_ELT(x, i) == NA_STRING){
INTEGER(ans)[i] = NA_INTEGER;
}
}
UNPROTECT(1);
return ans;
}

// deal with measure.vars of type VECSXP
SEXP measurelist(SEXP measure, SEXP dtnames) {
const int n=length(measure);
Expand All @@ -105,7 +120,7 @@ SEXP measurelist(SEXP measure, SEXP dtnames) {
SEXP x = VECTOR_ELT(measure, i);
switch(TYPEOF(x)) {
case STRSXP :
SET_VECTOR_ELT(ans, i, chmatch(x, dtnames, 0));
SET_VECTOR_ELT(ans, i, chmatch_na(x, dtnames));
break;
case REALSXP :
SET_VECTOR_ELT(ans, i, coerceVector(x, INTSXP));
Expand Down Expand Up @@ -138,6 +153,10 @@ static SEXP unlist_(SEXP xint) {
return(ans);
}

bool invalid_measure(int i, int ncol) {
return i != NA_INTEGER && (i <= 0 || i > ncol);
}

SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) {
int i, ncol=LENGTH(DT), targetcols=0, protecti=0, u=0, v=0;
SEXP thiscol, idcols = R_NilValue, valuecols = R_NilValue, tmp, tmp2, booltmp, unqtmp, ans;
Expand Down Expand Up @@ -203,7 +222,7 @@ SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) {
}
booltmp = PROTECT(duplicated(tmp, FALSE)); protecti++;
for (i=0; i<length(tmp); i++) {
if (INTEGER(tmp)[i] <= 0 || INTEGER(tmp)[i] > ncol)
if (invalid_measure(INTEGER(tmp)[i], ncol))
error(_("One or more values in 'measure.vars' is invalid."));
else if (!LOGICAL(booltmp)[i]) targetcols++;
else continue;
Expand Down Expand Up @@ -249,7 +268,7 @@ SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) {
tmp = PROTECT(unlist_(tmp2)); protecti++;
}
for (i=0; i<length(tmp); i++) {
if (INTEGER(tmp)[i] <= 0 || INTEGER(tmp)[i] > ncol)
if (invalid_measure(INTEGER(tmp)[i], ncol))
tdhock marked this conversation as resolved.
Show resolved Hide resolved
error(_("One or more values in 'measure.vars' is invalid."));
}
if (isNewList(measure)) valuecols = tmp2;
Expand All @@ -260,26 +279,35 @@ SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) {
}
ans = PROTECT(allocVector(VECSXP, 2)); protecti++;
SET_VECTOR_ELT(ans, 0, idcols);
SET_VECTOR_ELT(ans, 1, valuecols);
SET_VECTOR_ELT(ans, 1, valuecols);//List of integer vectors.
UNPROTECT(protecti);
return(ans);
}

struct processData {
SEXP RCHK; // a 2 item list holding vars (result of checkVars) and naidx. PROTECTed up in fmelt so that preprocess() doesn't need to PROTECT. To pass rchk, #2865
SEXP idcols, valuecols, naidx; // convenience pointers into RCHK[0][0], RCHK[0][1] and RCHK[1] respectively
int lids, lvalues, lmax, lmin, totlen, nrow;
Copy link
Member

@mattdowle mattdowle May 9, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see that one item (lmin) has been removed from processData. But it looks like you've reviewed the items and commented them which is great (so you know this code better than I do). And no existing tests are changed, just new tests added. So I'm just logging as a comment that I spotted that removed item, and why it looks good to me.

int *isfactor, *leach, *isidentical;
SEXP idcols,
valuecols, // list with one element per output/value column, each
// element is an integer vector.
naidx; // convenience pointers into RCHK[0][0], RCHK[0][1] and RCHK[1] respectively
int *isfactor,
*leach, // length of each element of the valuecols(measure.vars) list.
*isidentical; // are all inputs for this value column the same type?
int lids, // number of id columns.
lvalues, // number of value columns.
lmax, //max length of valuecols elements / number of times to repeat ids.
totlen, // of output/long DT result of melt operation.
nrow; // of input/wide DT to be melted.
SEXPTYPE *maxtype;
Rboolean narm;
Rboolean narm; // remove missing values?
};

static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valnames, Rboolean narm, Rboolean verbose, struct processData *data) {

SEXP vars,tmp,thiscol;
SEXPTYPE type;
int i,j;
data->lmax = 0; data->lmin = 0; data->totlen = 0; data->nrow = length(VECTOR_ELT(DT, 0));
data->lmax = 0; data->totlen = 0; data->nrow = length(VECTOR_ELT(DT, 0));
SET_VECTOR_ELT(data->RCHK, 0, vars = checkVars(DT, id, measure, verbose));
data->idcols = VECTOR_ELT(vars, 0);
data->valuecols = VECTOR_ELT(vars, 1);
Expand All @@ -296,29 +324,36 @@ static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valna
data->isidentical = (int *)R_alloc(data->lvalues, sizeof(int));
data->isfactor = (int *)R_alloc(data->lvalues, sizeof(int));
data->maxtype = (SEXPTYPE *)R_alloc(data->lvalues, sizeof(SEXPTYPE));
for (i=0; i<data->lvalues; i++) {
// first find max type of each output column.
for (i=0; i<data->lvalues; i++) { // for each output column.
tmp = VECTOR_ELT(data->valuecols, i);
data->leach[i] = length(tmp);
data->isidentical[i] = 1; // TODO - why 1 and not Rboolean TRUE?
data->isfactor[i] = 0; // seems to hold 2 below, so not an Rboolean FALSE here. TODO - better name for variable?
data->maxtype[i] = 0; // R_alloc doesn't initialize so careful to here, relied on below
data->lmax = (data->lmax > data->leach[i]) ? data->lmax : data->leach[i];
data->lmin = (data->lmin < data->leach[i]) ? data->lmin : data->leach[i];
for (j=0; j<data->leach[i]; j++) {
thiscol = VECTOR_ELT(DT, INTEGER(tmp)[j]-1);
if (isFactor(thiscol)) {
data->isfactor[i] = (isOrdered(thiscol)) ? 2 : 1;
data->maxtype[i] = STRSXP;
} else {
type = TYPEOF(thiscol);
if (type > data->maxtype[i]) data->maxtype[i] = type;
for (j=0; j<data->leach[i]; j++) { // for each input column.
int this_col_num = INTEGER(tmp)[j];
if(this_col_num != NA_INTEGER){
thiscol = VECTOR_ELT(DT, this_col_num-1);
if (isFactor(thiscol)) {
data->isfactor[i] = (isOrdered(thiscol)) ? 2 : 1;
data->maxtype[i] = STRSXP;
} else {
type = TYPEOF(thiscol);
if (type > data->maxtype[i]) data->maxtype[i] = type;
}
}
}
for (j=0; j<data->leach[i]; j++) {
thiscol = VECTOR_ELT(DT, INTEGER(tmp)[j]-1);
if ( (!isFactor(thiscol) && data->maxtype[i] != TYPEOF(thiscol)) || (isFactor(thiscol) && data->maxtype[i] != STRSXP) ) {
data->isidentical[i] = 0;
break;
int this_col_num = INTEGER(tmp)[j];
if(this_col_num != NA_INTEGER){
thiscol = VECTOR_ELT(DT, this_col_num-1);
if ( (!isFactor(thiscol) && data->maxtype[i] != TYPEOF(thiscol)) ||
(isFactor(thiscol) && data->maxtype[i] != STRSXP) ) {
data->isidentical[i] = 0;
break;
}
}
}
}
Expand Down Expand Up @@ -427,18 +462,25 @@ SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, s
SEXP flevels = PROTECT(allocVector(VECSXP, data->lmax));
Rboolean *isordered = (Rboolean *)R_alloc(data->lmax, sizeof(Rboolean));
SEXP ansvals = PROTECT(allocVector(VECSXP, data->lvalues));
for (int i=0; i<data->lvalues; ++i) {
for (int i=0; i<data->lvalues; ++i) {//for each output/value column.
bool thisvalfactor = (data->maxtype[i] == VECSXP) ? false : valfactor;
SEXP target = PROTECT(allocVector(data->maxtype[i], data->totlen)); // to keep rchk happy
SET_VECTOR_ELT(ansvals, i, target);
UNPROTECT(1); // still protected by virtue of being member of protected ansval.
SEXP thisvaluecols = VECTOR_ELT(data->valuecols, i);
SEXP thisvaluecols = VECTOR_ELT(data->valuecols, i); // integer vector of column ids.
int counter = 0;
bool copyattr = false;
for (int j=0; j<data->lmax; ++j) {
for (int j=0; j<data->lmax; ++j) {// for each input column.
int thisprotecti = 0;
SEXP thiscol = (j < data->leach[i]) ? VECTOR_ELT(DT, INTEGER(thisvaluecols)[j]-1)
: allocNAVector(data->maxtype[i], data->nrow);
// TODO use this line of code if NA specified.
SEXP thiscol;
int input_column_num = INTEGER(thisvaluecols)[j];
if (j >= data->leach[i] || // fewer indices than the max were specified.
input_column_num == NA_INTEGER) { // NA was specified.
thiscol = allocNAVector(data->maxtype[i], data->nrow);
}else{
thiscol = VECTOR_ELT(DT, input_column_num-1);
}
if (!copyattr && data->isidentical[i] && !data->isfactor[i]) {
copyMostAttrib(thiscol, target);
copyattr = true;
Expand Down