Skip to content

Commit

Permalink
doc_id to factor in data.frame output
Browse files Browse the repository at this point in the history
  • Loading branch information
junhewk committed Mar 31, 2024
1 parent dd3be5e commit 6a5cce5
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 39 deletions.
26 changes: 8 additions & 18 deletions src/posParallelRcpp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -196,29 +196,22 @@ DataFrame posParallelDFRcpp( StringVector text, std::string sys_dic, std::string
std::vector< std::vector < std::string > > results(text.size());
std::vector< std::string > input = as<std::vector< std::string > >(text);

StringVector doc_id;
IntegerVector doc_id;
IntegerVector sentence_id;
IntegerVector token_id;
StringVector token;
StringVector pos;
StringVector subtype;
StringVector analytic;

String doc_id_t;
String token_t;
String pos_t;
String subtype_t;
String analytic_t;

int doc_number = 0;
int doc_number = 1;
int sentence_number = 1;
int token_number = 1;
StringVector text_names;
bool b = text.hasAttribute("name");

if (b == TRUE) {
text_names = text.names();
}

std::vector<std::string> arguments = {"--dicdir", sys_dic, "--userdic", user_dic};

Expand All @@ -234,7 +227,7 @@ DataFrame posParallelDFRcpp( StringVector text, std::string sys_dic, std::string
return R_NilValue;
}

// parallel argorithm with Intell TBB
// parallel algorithm with Intel TBB
// RcppParallel doesn't accept CharacterVector as input or output
TextParseDF func = TextParseDF(&input, results, model);
tbb::parallel_for(tbb::blocked_range<size_t>(0, input.size()), func);
Expand Down Expand Up @@ -278,20 +271,17 @@ DataFrame posParallelDFRcpp( StringVector text, std::string sys_dic, std::string
}

// append doc_id
if (b == TRUE) {
doc_id_t = text_names[doc_number];
doc_id_t.set_encoding(CE_UTF8);
doc_id.push_back(doc_id_t);
} else {
doc_id.push_back(std::to_string(doc_number + 1));
}

doc_id.push_back(doc_number);
}
sentence_number = 1;
token_number = 1;
doc_number++;
}

// doc_id to factor
doc_id.attr("class") = "factor";
doc_id.attr("levels") = text;

return DataFrame::create(_["doc_id"]=doc_id, _["sentence_id"]=sentence_id, _["token_id"]=token_id, _["token"]=token, _["pos"]=pos, _["subtype"]=subtype, _["analytic"]=analytic);
}

Expand Down
34 changes: 13 additions & 21 deletions src/posloopRcpp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -144,28 +144,22 @@ DataFrame posDFRcpp(StringVector text, std::string sys_dic, std::string user_dic
// lattice model
StringVector::iterator it;

StringVector doc_id;
IntegerVector doc_id;
IntegerVector sentence_id;
IntegerVector token_id;
StringVector token;
StringVector pos;
StringVector subtype;
StringVector analytic;
CharacterVector token;
CharacterVector pos;
CharacterVector subtype;
CharacterVector analytic;

String doc_id_t;
String token_t;
String pos_t;
String subtype_t;
String analytic_t;

int doc_number = 0;
int doc_number = 1;
int sentence_number = 1;
int token_number = 1;
StringVector text_names;
bool b = text.hasAttribute("name");
if (b == TRUE) {
text_names = text.names();
}

std::vector<std::string> arguments = {"--dicdir", sys_dic, "--userdic", user_dic};

Expand Down Expand Up @@ -239,25 +233,23 @@ DataFrame posDFRcpp(StringVector text, std::string sys_dic, std::string user_dic
token_number = 1;
}

// append doc_id
if (b == TRUE) {
doc_id_t = text_names[doc_number];
doc_id_t.set_encoding(CE_UTF8);
doc_id.push_back(doc_id_t);
} else {
doc_id.push_back(std::to_string(doc_number + 1));
}
// append doc_id as int
doc_id.push_back(doc_number);
}
}
sentence_number = 1;
token_number = 1;
doc_number++;
}

// doc_id to factor
doc_id.attr("class") = "factor";
doc_id.attr("levels") = text;

// gc
delete lattice;
delete tagger;
delete model;

return DataFrame::create(_["doc_id"]=doc_id, _["sentence_id"]=sentence_id, _["token_id"]=token_id, _["token"]=token, _["pos"]=pos, _["subtype"]=subtype, _["analytic"]=analytic);
return DataFrame::create(Named("doc_id")=doc_id, Named("sentence_id")=sentence_id, Named("token_id")=token_id, Named("token")=token, Named("pos")=pos, Named("subtype")=subtype, Named("analytic")=analytic);
}

0 comments on commit 6a5cce5

Please sign in to comment.