Skip to content

Commit

Permalink
[c++/python/r] Use libtiledbsoma for R schema evolution (#3100)
Browse files Browse the repository at this point in the history
* Push R `update_dataframe_schema` down to C++

* devtools::document()

* DESCRIPTION and NEWS.md

* code-review feedback

Co-authored-by: Paul Hoffman <mojaveazure@users.noreply.github.com>

* unit testing

---------

Co-authored-by: Paul Hoffman <mojaveazure@users.noreply.github.com>
  • Loading branch information
johnkerl and mojaveazure authored Oct 2, 2024
1 parent 2dab4e9 commit 6541cfe
Show file tree
Hide file tree
Showing 9 changed files with 188 additions and 27 deletions.
2 changes: 1 addition & 1 deletion apis/r/DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Description: Interface for working with 'TileDB'-based Stack of Matrices,
like those commonly used for single cell data analysis. It is documented at
<https://github.com/single-cell-data>; a formal specification is available at
<https://github.com/single-cell-data/SOMA/blob/main/abstract_specification.md>.
Version: 1.14.99.2
Version: 1.14.99.3
Authors@R: c(
person(given = "Aaron", family = "Wolen",
role = c("cre", "aut"), email = "aaron@tiledb.com",
Expand Down
1 change: 1 addition & 0 deletions apis/r/NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

## Changes

* Use `libtiledbsoma` for R schema evolution [#3100](https://github.com/single-cell-data/TileDB-SOMA/pull/3100)
* Implement missing `domain` argument to `SOMADataFrame` `create` [#3032](https://github.com/single-cell-data/TileDB-SOMA/pull/3032)
* Remove unused `fragment_count` accessor [#3054](https://github.com/single-cell-data/TileDB-SOMA/pull/3054)

Expand Down
4 changes: 4 additions & 0 deletions apis/r/R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,10 @@ tiledbsoma_upgrade_shape <- function(uri, new_shape, ctxxp) {
invisible(.Call(`_tiledbsoma_tiledbsoma_upgrade_shape`, uri, new_shape, ctxxp))
}

# Thin R-side wrapper for the compiled `_tiledbsoma_c_update_dataframe_schema`
# entry point. All six arguments are forwarded unchanged through .Call(), and
# whatever the native routine returns is handed back invisibly.
c_update_dataframe_schema <- function(uri, ctxxp, column_names_to_drop, add_cols_types, add_cols_enum_value_types, add_cols_enum_ordered) {
invisible(.Call(`_tiledbsoma_c_update_dataframe_schema`, uri, ctxxp, column_names_to_drop, add_cols_types, add_cols_enum_value_types, add_cols_enum_ordered))
}

#' Iterator-Style Access to SOMA Array via SOMAArray
#'
#' The `sr_*` functions provide low-level access to an instance of the SOMAArray
Expand Down
55 changes: 29 additions & 26 deletions apis/r/R/SOMADataFrame.R
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,7 @@ SOMADataFrame <- R6::R6Class(
#' prior to performing the update. The name of this new column will be set
#' to the value specified by `row_index_name`.
update = function(values, row_index_name = NULL) {

private$check_open_for_write()
stopifnot(
"'values' must be a data.frame, Arrow Table or RecordBatch" =
Expand Down Expand Up @@ -299,47 +300,49 @@ SOMADataFrame <- R6::R6Class(
new_schema[common_cols]
)

# Drop columns
se <- tiledb::tiledb_array_schema_evolution()
for (drop_col in drop_cols) {
spdl::debug("[SOMADataFrame update]: dropping column '{}'", drop_col)
se <- tiledb::tiledb_array_schema_evolution_drop_attribute(
object = se,
attrname = drop_col
)
}
drop_cols_for_clib <- drop_cols
add_cols_types_for_clib <-
add_cols_enum_value_types_for_clib <-
add_cols_enum_ordered_for_clib <- vector("list", length = length(add_cols))
names(add_cols_types_for_clib) <-
names(add_cols_enum_value_types_for_clib) <-
names(add_cols_enum_ordered_for_clib) <- add_cols

# Add columns
for (add_col in add_cols) {
spdl::debug("[SOMADataFrame update]: adding column '{}'", add_col)

col_type <- new_schema$GetFieldByName(add_col)$type
attr <- tiledb_attr_from_arrow_field(
field = new_schema$GetFieldByName(add_col),
tiledb_create_options = tiledb_create_options
)

if (inherits(col_type, "DictionaryType")) {
spdl::debug(
"[SOMADataFrame update]: adding column '{}' as an enumerated type",
add_col
)
se <- tiledb::tiledb_array_schema_evolution_add_enumeration(
object = se,
name = add_col,
enums = levels(values$GetColumnByName(add_col)$as_vector()),
ordered = col_type$ordered
"[SOMADataFrame update]: adding enum column '{}' index type '{}' value type '{}' ordered {}",
add_col, col_type$index_type$name, col_type$value_type$name, col_type$ordered
)
attr <- tiledb::tiledb_attribute_set_enumeration_name(attr, add_col)
}

se <- tiledb::tiledb_array_schema_evolution_add_attribute(se, attr)
add_cols_types_for_clib[[add_col]] <- col_type$index_type$name
add_cols_enum_value_types_for_clib[[add_col]] <- col_type$value_type$name
add_cols_enum_ordered_for_clib[[add_col]] <- col_type$ordered
} else {
spdl::debug("[SOMADataFrame update]: adding column '{}' type '{}'", add_col, col_type$name)

add_cols_types_for_clib[[add_col]] <- col_type$name
}
}

se <- tiledb::tiledb_array_schema_evolution_array_evolve(se, self$uri)
if (length(drop_cols_for_clib) > 0 || length(add_cols_types_for_clib) > 0) {
c_update_dataframe_schema(
self$uri,
private$.soma_context,
drop_cols_for_clib,
Filter(Negate(is.null), add_cols_types_for_clib),
Filter(Negate(is.null), add_cols_enum_value_types_for_clib),
Filter(Negate(is.null), add_cols_enum_ordered_for_clib)
)
}

# Reopen array for writing with new schema
self$reopen(mode = "WRITE")

spdl::debug("[SOMADataFrame update]: Writing new data")
self$write(values)
},
Expand Down
29 changes: 29 additions & 0 deletions apis/r/man/SOMADataFrame.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 16 additions & 0 deletions apis/r/src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -512,6 +512,21 @@ BEGIN_RCPP
return R_NilValue;
END_RCPP
}
// c_update_dataframe_schema
// Declaration of the implementation function that the glue routine below
// forwards to.
void c_update_dataframe_schema(const std::string& uri, Rcpp::XPtr<somactx_wrap_t> ctxxp, Rcpp::CharacterVector column_names_to_drop, Rcpp::List add_cols_types, Rcpp::List add_cols_enum_value_types, Rcpp::List add_cols_enum_ordered);
// .Call entry point: each incoming SEXP is bound to a C++ parameter of the
// type declared above, the implementation is invoked, and R_NilValue is
// returned because the implementation returns void.
// NOTE(review): BEGIN_RCPP / END_RCPP are Rcpp macros defined outside this
// file; presumably they bracket the body with Rcpp's exception handling.
RcppExport SEXP _tiledbsoma_c_update_dataframe_schema(SEXP uriSEXP, SEXP ctxxpSEXP, SEXP column_names_to_dropSEXP, SEXP add_cols_typesSEXP, SEXP add_cols_enum_value_typesSEXP, SEXP add_cols_enum_orderedSEXP) {
BEGIN_RCPP
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< const std::string& >::type uri(uriSEXP);
Rcpp::traits::input_parameter< Rcpp::XPtr<somactx_wrap_t> >::type ctxxp(ctxxpSEXP);
Rcpp::traits::input_parameter< Rcpp::CharacterVector >::type column_names_to_drop(column_names_to_dropSEXP);
Rcpp::traits::input_parameter< Rcpp::List >::type add_cols_types(add_cols_typesSEXP);
Rcpp::traits::input_parameter< Rcpp::List >::type add_cols_enum_value_types(add_cols_enum_value_typesSEXP);
Rcpp::traits::input_parameter< Rcpp::List >::type add_cols_enum_ordered(add_cols_enum_orderedSEXP);
c_update_dataframe_schema(uri, ctxxp, column_names_to_drop, add_cols_types, add_cols_enum_value_types, add_cols_enum_ordered);
return R_NilValue;
END_RCPP
}
// sr_setup
Rcpp::XPtr<tdbs::SOMAArray> sr_setup(const std::string& uri, Rcpp::XPtr<somactx_wrap_t> ctxxp, Rcpp::Nullable<Rcpp::CharacterVector> colnames, Rcpp::Nullable<Rcpp::XPtr<tiledb::QueryCondition>> qc, Rcpp::Nullable<Rcpp::List> dim_points, Rcpp::Nullable<Rcpp::List> dim_ranges, std::string batch_size, std::string result_order, Rcpp::Nullable<Rcpp::DatetimeVector> timestamprange, const std::string& loglevel);
RcppExport SEXP _tiledbsoma_sr_setup(SEXP uriSEXP, SEXP ctxxpSEXP, SEXP colnamesSEXP, SEXP qcSEXP, SEXP dim_pointsSEXP, SEXP dim_rangesSEXP, SEXP batch_sizeSEXP, SEXP result_orderSEXP, SEXP timestamprangeSEXP, SEXP loglevelSEXP) {
Expand Down Expand Up @@ -722,6 +737,7 @@ static const R_CallMethodDef CallEntries[] = {
{"_tiledbsoma_resize", (DL_FUNC) &_tiledbsoma_resize, 3},
{"_tiledbsoma_resize_soma_joinid", (DL_FUNC) &_tiledbsoma_resize_soma_joinid, 3},
{"_tiledbsoma_tiledbsoma_upgrade_shape", (DL_FUNC) &_tiledbsoma_tiledbsoma_upgrade_shape, 3},
{"_tiledbsoma_c_update_dataframe_schema", (DL_FUNC) &_tiledbsoma_c_update_dataframe_schema, 6},
{"_tiledbsoma_sr_setup", (DL_FUNC) &_tiledbsoma_sr_setup, 10},
{"_tiledbsoma_sr_complete", (DL_FUNC) &_tiledbsoma_sr_complete, 1},
{"_tiledbsoma_create_empty_arrow_table", (DL_FUNC) &_tiledbsoma_create_empty_arrow_table, 0},
Expand Down
81 changes: 81 additions & 0 deletions apis/r/src/rinterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -415,3 +415,84 @@ void tiledbsoma_upgrade_shape(
sr->upgrade_shape(new_shape_i64);
sr->close();
}

// [[Rcpp::export]]
void c_update_dataframe_schema(
const std::string& uri,
Rcpp::XPtr<somactx_wrap_t> ctxxp,
Rcpp::CharacterVector column_names_to_drop,
Rcpp::List add_cols_types,
Rcpp::List add_cols_enum_value_types,
Rcpp::List add_cols_enum_ordered) {
// Drop columns is just a list of column names: it goes right through
// from R to C++.
std::vector<std::string> drop_attrs = Rcpp::as<std::vector<std::string>>(
column_names_to_drop);

// For add columns: coming from R we have a named list from attr name to:
// * for non-enum attrs: the datatype of the attr
// * for enum attrs: the index type (e.g. int8) of the enumeration attr
std::map<std::string, std::string> add_attrs;
int n_add = add_cols_types.length();

if (n_add > 0) {
// Calling .names on empty list results in:
// Not compatible with STRSXP: [type=NULL].Abort trap: 6
Rcpp::CharacterVector add_col_names = add_cols_types.names();
for (int i = 0; i < n_add; i++) {
std::string type_name = Rcpp::as<std::string>(add_cols_types[i]);

// Map type names like "int8" in the R Arrow API to type names like
// "c" in the C NanoArrow API. I looked and didn't find an R
// accessor for this; no big deal. Here we remap what we know about,
// and if there's still an unrecognized type name (which is
// developer error, not user error), we will let libtiledbsoma
// throw.
type_name = remap_arrow_type_code_r_to_c(type_name);

add_attrs.emplace(add_col_names[i], type_name);
}
}

// For enum columns, two more things: value type (e.g. string) and
// is-ordered-enum boolean. These come into us as separate lists but
// we will reshape them into a map from enum-attr name to pair of
// (value_type, ordered).

// First do integrity checks.
if (add_cols_enum_value_types.length() != add_cols_enum_ordered.length()) {
// This isn't user error
throw Rcpp::exception(
"c_update_dataframe_schema: internal coding error");
}

std::map<std::string, std::pair<std::string, bool>> add_enmrs;
int n_add_enum = add_cols_enum_value_types.length();
if (n_add_enum > 0) {
// Calling .names on empty list results in:
// Not compatible with STRSXP: [type=NULL].Abort trap: 6
Rcpp::CharacterVector add_enum_col_names = add_cols_enum_value_types
.names();
Rcpp::CharacterVector other_names = add_cols_enum_ordered.names();
for (int i = 0; i < n_add_enum; i++) {
if (add_enum_col_names[i] != other_names[i]) {
// This also isn't user error
throw Rcpp::exception(
"c_update_dataframe_schema: internal coding error");
}
}

for (int i = 0; i < n_add_enum; i++) {
std::string key = Rcpp::as<std::string>(add_enum_col_names[i]);
std::string type_name = Rcpp::as<std::string>(
add_cols_enum_value_types[i]);
type_name = remap_arrow_type_code_r_to_c(type_name);
bool ordered = Rcpp::as<bool>(add_cols_enum_ordered[i]);

add_enmrs.emplace(key, std::pair(type_name, ordered));
}
}

tdbs::SOMADataFrame::update_dataframe_schema(
uri, ctxxp->ctxptr, drop_attrs, add_attrs, add_enmrs);
}
24 changes: 24 additions & 0 deletions apis/r/src/rutilities.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -501,3 +501,27 @@ SEXP convert_domainish(const tdbs::ArrowTable& arrow_table) {

return arrayxp;
}

// Fixed lookup table from type names as spelled by the R Arrow API (keys) to
// the short type codes used on the C side (values).
//
// The table is never modified, so it is declared const. Its name avoids a
// leading underscore because identifiers that begin with an underscore are
// reserved to the implementation in the global namespace.
static const std::map<std::string, std::string> arrow_type_name_remap = {
    {"int8", "c"},
    {"int16", "s"},
    {"int32", "i"},
    {"int64", "l"},
    {"uint8", "C"},
    {"uint16", "S"},
    {"uint32", "I"},
    {"uint64", "L"},
    {"utf8", "u"},
    {"large_utf8", "U"},
    {"bool", "b"},
    {"float", "f"},
    {"double", "g"}};

// Translates an R Arrow type name (e.g. "int8") into its C-side type code
// (e.g. "c").
//
// input: the type name to translate.
// Returns the mapped code when `input` is a key of the table above;
// otherwise returns `input` unchanged, leaving it to the caller (or code
// further downstream) to reject names it does not understand.
std::string remap_arrow_type_code_r_to_c(std::string input) {
    const auto it = arrow_type_name_remap.find(input);
    if (it != arrow_type_name_remap.end()) {
        return it->second;
    }
    return input;
}
3 changes: 3 additions & 0 deletions apis/r/src/rutilities.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,3 +76,6 @@ std::vector<int64_t> i64_from_rcpp_numeric(const Rcpp::NumericVector& input);
// * obtain an ArrowTable
// * need to map that to an R list of lo/hi pairs
SEXP convert_domainish(const tdbs::ArrowTable& arrow_table);

// Maps type names from the R Arrow API to the type codes used by the C
// NanoArrow API, e.g. "int8" to "c" and "float" to "f". A name with no known
// mapping is returned unchanged.
std::string remap_arrow_type_code_r_to_c(std::string input);

0 comments on commit 6541cfe

Please sign in to comment.