Skip to content

Rewrite ngram implementation with an eye for cpp11 performance #1

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 11, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 66 additions & 61 deletions src/code.cpp
Original file line number Diff line number Diff line change
@@ -1,87 +1,92 @@
#include <cpp11.hpp>
#include <string>
using namespace cpp11;

writable::strings ngram_single(strings x, int n, std::string delim_string) {

if (n == 1) {
return(x);
}

int len = x.size();
int range = std::max(len - n + 1, 0);

writable::strings res (range);

if (range == 0) {
return(res);
}

for (int i = 0; i < range; ++i) {
std::string elt = cpp11::r_string(x[i]);
for(int j = 1; j < n; ++j) {
std::string elt_other = cpp11::r_string(x[i + j]);
elt = elt + delim_string + elt_other;
static
void
fill_one_ngram(const cpp11::strings& x,
int n,
const std::string& delim,
cpp11::writable::strings& out,
R_xlen_t& loc) {
const R_xlen_t x_size = x.size();
const R_xlen_t range = std::max(x_size - n + 1, static_cast<R_xlen_t>(0));

// Not strictly necessary to call `unwind_protect()` here because we also
// call it in `cpp11_ngram()`, but it seems like it would be good practice
// to call it here too because this is where we leave cpp11 and use the less
// safe but faster R API directly.
cpp11::unwind_protect([&] {
for (R_xlen_t i = 0; i < range; ++i) {
// `x[i]` goes through `r_string`, and that is too expensive because
// generating each `r_string` involves protecting each CHARSXP.
std::string elt = CHAR(STRING_ELT(x, i));

for (R_xlen_t j = 1; j < n; ++j) {
const std::string piece = CHAR(STRING_ELT(x, i + j));
elt = elt + delim + piece;
}

// `out[i] = elt` would approximately do the same thing, but it goes
// through `std::string elt` -> `r_string` before assigning, and that
// again involves protecting each `r_string`, which is expensive and not
// necessary.
SET_STRING_ELT(out, loc, Rf_mkCharLenCE(elt.data(), elt.size(), CE_UTF8));
++loc;
}
res[i] = elt;
}

return(res);
});
}


strings ngram(strings x, int n, int n_min, std::string delim_string) {

// The following 5 lines of code was done to pre-allocate
// I will try to convert to push_back()
int res_len = 0;
int x_len = x.size();
static
cpp11::writable::strings
ngram(const cpp11::strings& x,
int n,
int n_min,
const std::string& delim) {
R_xlen_t out_size = 0;
const R_xlen_t x_size = x.size();

for (int i = n_min; i <= n; ++i) {
res_len += std::max(x_len - i + 1, 0);
out_size += std::max(x_size - i + 1, static_cast<R_xlen_t>(0));
}

writable::strings res (res_len);
writable::strings temp_res;
int index = 0;
cpp11::writable::strings out(out_size);
R_xlen_t loc = 0;

for (int i = n_min; i <= n; ++i) {

temp_res = ngram_single(x, i, delim_string);
int temp_res_len = temp_res.size();

for(int j = 0; j < temp_res_len; ++j) {
res[index] = cpp11::r_string(temp_res[j]);
++index;
// res.push_back(temp_res[j]);
}
fill_one_ngram(x, i, delim, out, loc);
}

return(res);
return(out);
}

[[cpp11::register]]
list cpp11_ngram(list x, int n, int n_min, cpp11::r_string delim) {

std::string delim_string = cpp11::as_cpp<std::string>(delim);

cpp11::writable::list_of<cpp11::writable::strings>
cpp11_ngram(cpp11::list_of<cpp11::strings> x,
int n,
int n_min,
std::string delim) {
if (n <= 0) {
stop("n must be a positive integer.");
cpp11::stop("n must be a positive integer.");
}

if (n_min <= 0) {
stop("n_min must be a positive integer.");
cpp11::stop("n_min must be a positive integer.");
}

if (n_min > n) {
stop("n_min must be larger then n.");
cpp11::stop("n_min must be larger then n.");
}

int len = x.size();
writable::list res (len);
const R_xlen_t x_size = x.size();
cpp11::writable::list_of<cpp11::writable::strings> out(x_size);

for (int i = 0; i < len; ++i) {
res[i] = ngram(x[i], n, n_min, delim_string);
}
return(res);
// Calling `unwind_protect()` here because each call to `ngram()` allocates
// a `cpp11::writable::strings` vector, and that allocation calls
// `unwind_protect()` too, so `unwind_protect()` would be run `x_size` times.
// Calling it here "turns off" the nested inner calls to it.
cpp11::unwind_protect([&] {
for (R_xlen_t i = 0; i < x_size; ++i) {
out[i] = ngram(x[i], n, n_min, delim);
}
});

return(out);
}
4 changes: 2 additions & 2 deletions src/cpp11.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
#include <R_ext/Visibility.h>

// code.cpp
list cpp11_ngram(list x, int n, int n_min, cpp11::r_string delim);
cpp11::writable::list_of<cpp11::writable::strings> cpp11_ngram(cpp11::list_of<cpp11::strings> x, int n, int n_min, std::string delim);
extern "C" SEXP _cpp11ngram_cpp11_ngram(SEXP x, SEXP n, SEXP n_min, SEXP delim) {
BEGIN_CPP11
return cpp11::as_sexp(cpp11_ngram(cpp11::as_cpp<cpp11::decay_t<list>>(x), cpp11::as_cpp<cpp11::decay_t<int>>(n), cpp11::as_cpp<cpp11::decay_t<int>>(n_min), cpp11::as_cpp<cpp11::decay_t<cpp11::r_string>>(delim)));
return cpp11::as_sexp(cpp11_ngram(cpp11::as_cpp<cpp11::decay_t<cpp11::list_of<cpp11::strings>>>(x), cpp11::as_cpp<cpp11::decay_t<int>>(n), cpp11::as_cpp<cpp11::decay_t<int>>(n_min), cpp11::as_cpp<cpp11::decay_t<std::string>>(delim)));
END_CPP11
}

Expand Down