-
Notifications
You must be signed in to change notification settings - Fork 982
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
support missing values in measure.vars arg to melt #4720
Changes from 14 commits
cadfd09
522cc78
264169e
9a58c4c
0b69386
2a242e0
50b7b2e
6c8bb51
4c5810c
0bf6323
5725b25
854f189
e9322a7
02180c0
1f50aaf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -105,7 +105,7 @@ SEXP measurelist(SEXP measure, SEXP dtnames) { | |
SEXP x = VECTOR_ELT(measure, i); | ||
switch(TYPEOF(x)) { | ||
case STRSXP : | ||
SET_VECTOR_ELT(ans, i, chmatch(x, dtnames, 0)); | ||
SET_VECTOR_ELT(ans, i, chmatch(x, dtnames, NA_INTEGER)); | ||
break; | ||
case REALSXP : | ||
SET_VECTOR_ELT(ans, i, coerceVector(x, INTSXP)); | ||
|
@@ -138,6 +138,10 @@ static SEXP unlist_(SEXP xint) { | |
return(ans); | ||
} | ||
|
||
bool invalid_measure(int i, int ncol) { | ||
return i != NA_INTEGER && (i <= 0 || i > ncol); | ||
} | ||
|
||
SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) { | ||
int i, ncol=LENGTH(DT), targetcols=0, protecti=0, u=0, v=0; | ||
SEXP thiscol, idcols = R_NilValue, valuecols = R_NilValue, tmp, tmp2, booltmp, unqtmp, ans; | ||
|
@@ -203,7 +207,7 @@ SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) { | |
} | ||
booltmp = PROTECT(duplicated(tmp, FALSE)); protecti++; | ||
for (i=0; i<length(tmp); i++) { | ||
if (INTEGER(tmp)[i] <= 0 || INTEGER(tmp)[i] > ncol) | ||
if (invalid_measure(INTEGER(tmp)[i], ncol)) | ||
error(_("One or more values in 'measure.vars' is invalid.")); | ||
else if (!LOGICAL(booltmp)[i]) targetcols++; | ||
else continue; | ||
|
@@ -249,7 +253,7 @@ SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) { | |
tmp = PROTECT(unlist_(tmp2)); protecti++; | ||
} | ||
for (i=0; i<length(tmp); i++) { | ||
if (INTEGER(tmp)[i] <= 0 || INTEGER(tmp)[i] > ncol) | ||
if (invalid_measure(INTEGER(tmp)[i], ncol)) | ||
tdhock marked this conversation as resolved.
Show resolved
Hide resolved
|
||
error(_("One or more values in 'measure.vars' is invalid.")); | ||
} | ||
if (isNewList(measure)) valuecols = tmp2; | ||
|
@@ -260,26 +264,35 @@ SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) { | |
} | ||
ans = PROTECT(allocVector(VECSXP, 2)); protecti++; | ||
SET_VECTOR_ELT(ans, 0, idcols); | ||
SET_VECTOR_ELT(ans, 1, valuecols); | ||
SET_VECTOR_ELT(ans, 1, valuecols);//List of integer vectors. | ||
UNPROTECT(protecti); | ||
return(ans); | ||
} | ||
|
||
struct processData { | ||
SEXP RCHK; // a 2 item list holding vars (result of checkVars) and naidx. PROTECTed up in fmelt so that preprocess() doesn't need to PROTECT. To pass rchk, #2865 | ||
SEXP idcols, valuecols, naidx; // convenience pointers into RCHK[0][0], RCHK[0][1] and RCHK[1] respectively | ||
int lids, lvalues, lmax, lmin, totlen, nrow; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see that one item ( |
||
int *isfactor, *leach, *isidentical; | ||
SEXP idcols, | ||
valuecols, // list with one element per output/value column, each | ||
// element is an integer vector. | ||
naidx; // convenience pointers into RCHK[0][0], RCHK[0][1] and RCHK[1] respectively | ||
int *isfactor, | ||
*leach, // length of each element of the valuecols(measure.vars) list. | ||
*isidentical; // are all inputs for this value column the same type? | ||
int lids, // number of id columns. | ||
lvalues, // number of value columns. | ||
lmax, //max length of valuecols elements / number of times to repeat ids. | ||
totlen, // of output/long DT result of melt operation. | ||
nrow; // of input/wide DT to be melted. | ||
SEXPTYPE *maxtype; | ||
Rboolean narm; | ||
Rboolean narm; // remove missing values? | ||
}; | ||
|
||
static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valnames, Rboolean narm, Rboolean verbose, struct processData *data) { | ||
|
||
SEXP vars,tmp,thiscol; | ||
SEXPTYPE type; | ||
int i,j; | ||
data->lmax = 0; data->lmin = 0; data->totlen = 0; data->nrow = length(VECTOR_ELT(DT, 0)); | ||
data->lmax = 0; data->totlen = 0; data->nrow = length(VECTOR_ELT(DT, 0)); | ||
SET_VECTOR_ELT(data->RCHK, 0, vars = checkVars(DT, id, measure, verbose)); | ||
data->idcols = VECTOR_ELT(vars, 0); | ||
data->valuecols = VECTOR_ELT(vars, 1); | ||
|
@@ -296,29 +309,36 @@ static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valna | |
data->isidentical = (int *)R_alloc(data->lvalues, sizeof(int)); | ||
data->isfactor = (int *)R_alloc(data->lvalues, sizeof(int)); | ||
data->maxtype = (SEXPTYPE *)R_alloc(data->lvalues, sizeof(SEXPTYPE)); | ||
for (i=0; i<data->lvalues; i++) { | ||
// first find max type of each output column. | ||
for (i=0; i<data->lvalues; i++) { // for each output column. | ||
tmp = VECTOR_ELT(data->valuecols, i); | ||
data->leach[i] = length(tmp); | ||
data->isidentical[i] = 1; // TODO - why 1 and not Rboolean TRUE? | ||
data->isfactor[i] = 0; // seems to hold 2 below, so not an Rboolean FALSE here. TODO - better name for variable? | ||
data->maxtype[i] = 0; // R_alloc doesn't initialize so careful to here, relied on below | ||
data->lmax = (data->lmax > data->leach[i]) ? data->lmax : data->leach[i]; | ||
data->lmin = (data->lmin < data->leach[i]) ? data->lmin : data->leach[i]; | ||
for (j=0; j<data->leach[i]; j++) { | ||
thiscol = VECTOR_ELT(DT, INTEGER(tmp)[j]-1); | ||
if (isFactor(thiscol)) { | ||
data->isfactor[i] = (isOrdered(thiscol)) ? 2 : 1; | ||
data->maxtype[i] = STRSXP; | ||
} else { | ||
type = TYPEOF(thiscol); | ||
if (type > data->maxtype[i]) data->maxtype[i] = type; | ||
for (j=0; j<data->leach[i]; j++) { // for each input column. | ||
int this_col_num = INTEGER(tmp)[j]; | ||
if(this_col_num != NA_INTEGER){ | ||
thiscol = VECTOR_ELT(DT, this_col_num-1); | ||
if (isFactor(thiscol)) { | ||
data->isfactor[i] = (isOrdered(thiscol)) ? 2 : 1; | ||
data->maxtype[i] = STRSXP; | ||
} else { | ||
type = TYPEOF(thiscol); | ||
if (type > data->maxtype[i]) data->maxtype[i] = type; | ||
} | ||
} | ||
} | ||
for (j=0; j<data->leach[i]; j++) { | ||
thiscol = VECTOR_ELT(DT, INTEGER(tmp)[j]-1); | ||
if ( (!isFactor(thiscol) && data->maxtype[i] != TYPEOF(thiscol)) || (isFactor(thiscol) && data->maxtype[i] != STRSXP) ) { | ||
data->isidentical[i] = 0; | ||
break; | ||
int this_col_num = INTEGER(tmp)[j]; | ||
if(this_col_num != NA_INTEGER){ | ||
thiscol = VECTOR_ELT(DT, this_col_num-1); | ||
if ( (!isFactor(thiscol) && data->maxtype[i] != TYPEOF(thiscol)) || | ||
(isFactor(thiscol) && data->maxtype[i] != STRSXP) ) { | ||
data->isidentical[i] = 0; | ||
break; | ||
} | ||
} | ||
} | ||
} | ||
|
@@ -392,6 +412,16 @@ static SEXP combineFactorLevels(SEXP factorLevels, SEXP target, int * factorType | |
return ans; | ||
} | ||
|
||
SEXP input_col_or_na(SEXP DT, struct processData* data, SEXP thisvaluecols, int out_col, int in_col) { | ||
if (in_col < data->leach[out_col]) { | ||
int input_column_num = INTEGER(thisvaluecols)[in_col]; | ||
if (input_column_num != NA_INTEGER) { | ||
return VECTOR_ELT(DT, input_column_num-1); | ||
} | ||
} | ||
return allocNAVector(data->maxtype[out_col], data->nrow); | ||
} | ||
|
||
SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, struct processData *data) { | ||
for (int i=0; i<data->lvalues; ++i) { | ||
SEXP thisvaluecols = VECTOR_ELT(data->valuecols, i); | ||
|
@@ -407,12 +437,8 @@ SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, s | |
for (int i=0; i<data->lmax; ++i) { | ||
SEXP tmp = PROTECT(allocVector(VECSXP, data->lvalues)); | ||
for (int j=0; j<data->lvalues; ++j) { | ||
if (i < data->leach[j]) { | ||
SEXP thisvaluecols = VECTOR_ELT(data->valuecols, j); | ||
SET_VECTOR_ELT(tmp, j, VECTOR_ELT(DT, INTEGER(thisvaluecols)[i]-1)); | ||
} else { | ||
SET_VECTOR_ELT(tmp, j, allocNAVector(data->maxtype[j], data->nrow)); | ||
} | ||
SEXP thisvaluecols = VECTOR_ELT(data->valuecols, j); | ||
SET_VECTOR_ELT(tmp, j, input_col_or_na(DT, data, thisvaluecols, j, i)); | ||
} | ||
tmp = PROTECT(dt_na(tmp, seqcols)); | ||
SEXP w; | ||
|
@@ -427,18 +453,17 @@ SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, s | |
SEXP flevels = PROTECT(allocVector(VECSXP, data->lmax)); | ||
Rboolean *isordered = (Rboolean *)R_alloc(data->lmax, sizeof(Rboolean)); | ||
SEXP ansvals = PROTECT(allocVector(VECSXP, data->lvalues)); | ||
for (int i=0; i<data->lvalues; ++i) { | ||
for (int i=0; i<data->lvalues; ++i) {//for each output/value column. | ||
bool thisvalfactor = (data->maxtype[i] == VECSXP) ? false : valfactor; | ||
SEXP target = PROTECT(allocVector(data->maxtype[i], data->totlen)); // to keep rchk happy | ||
SET_VECTOR_ELT(ansvals, i, target); | ||
UNPROTECT(1); // still protected by virtue of being member of protected ansval. | ||
SEXP thisvaluecols = VECTOR_ELT(data->valuecols, i); | ||
SEXP thisvaluecols = VECTOR_ELT(data->valuecols, i); // integer vector of column ids. | ||
int counter = 0; | ||
bool copyattr = false; | ||
for (int j=0; j<data->lmax; ++j) { | ||
for (int j=0; j<data->lmax; ++j) {// for each input column. | ||
int thisprotecti = 0; | ||
SEXP thiscol = (j < data->leach[i]) ? VECTOR_ELT(DT, INTEGER(thisvaluecols)[j]-1) | ||
: allocNAVector(data->maxtype[i], data->nrow); | ||
SEXP thiscol = input_col_or_na(DT, data, thisvaluecols, i, j); | ||
if (!copyattr && data->isidentical[i] && !data->isfactor[i]) { | ||
copyMostAttrib(thiscol, target); | ||
copyattr = true; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The new
chmatch_na
function in this PR that I just removed I initially thought was necessary because this PR neededNA
s inx
not to match toNA
s intable
. So I started to addbool matchNAtoNA
argument tochmatchMain
. Then it turned out that just passingnomatch=NA_INTEGER
on this line (the only line wherechmatch_na
was used) instead ofnomatch=0
, seemed to work and passed the new tests. I leftmatchNAtoNA
as a comment inchmatchMain
in case we need to come back to that in future.@tdhock Please check and let me know if I got anything wrong here. If so, a new test is probably needed please.