From 6541cfe5f2ec4eb72e45462b1f16b3b9c78f17e6 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Wed, 2 Oct 2024 14:52:35 -0400 Subject: [PATCH] [c++/python/r] Use `libtiledbsoma` for R schema evolution (#3100) * Push R `update_dataframe_schema` down to C++ * devtools::document() * DESCRIPTION and NEWS.md * code-review feedback Co-authored-by: Paul Hoffman * unit testing --------- Co-authored-by: Paul Hoffman --- apis/r/DESCRIPTION | 2 +- apis/r/NEWS.md | 1 + apis/r/R/RcppExports.R | 4 ++ apis/r/R/SOMADataFrame.R | 55 +++++++++++++------------ apis/r/man/SOMADataFrame.Rd | 29 +++++++++++++ apis/r/src/RcppExports.cpp | 16 ++++++++ apis/r/src/rinterface.cpp | 81 +++++++++++++++++++++++++++++++++++++ apis/r/src/rutilities.cpp | 24 +++++++++++ apis/r/src/rutilities.h | 3 ++ 9 files changed, 188 insertions(+), 27 deletions(-) diff --git a/apis/r/DESCRIPTION b/apis/r/DESCRIPTION index 8a2eac0540..48ba0ad98f 100644 --- a/apis/r/DESCRIPTION +++ b/apis/r/DESCRIPTION @@ -6,7 +6,7 @@ Description: Interface for working with 'TileDB'-based Stack of Matrices, like those commonly used for single cell data analysis. It is documented at ; a formal specification available is at . -Version: 1.14.99.2 +Version: 1.14.99.3 Authors@R: c( person(given = "Aaron", family = "Wolen", role = c("cre", "aut"), email = "aaron@tiledb.com", diff --git a/apis/r/NEWS.md b/apis/r/NEWS.md index b2ccab779b..4541405113 100644 --- a/apis/r/NEWS.md +++ b/apis/r/NEWS.md @@ -2,6 +2,7 @@ ## Changes +* Use `libtiledbsoma` for R schema evolution [#3100](https://github.com/single-cell-data/TileDB-SOMA/pull/3100) * Implement missing `domain` argument to `SOMADataFrame` `create` [#3032](https://github.com/single-cell-data/TileDB-SOMA/pull/3032) * Remove unused `fragment_count` accessor [#3054](https://github.com/single-cell-data/TileDB-SOMA/pull/3054) diff --git a/apis/r/R/RcppExports.R b/apis/r/R/RcppExports.R index c1d74177a6..9501bf7aac 100644 --- a/apis/r/R/RcppExports.R +++ b/apis/r/R/RcppExports.R @@ -226,6 +226,10 @@ tiledbsoma_upgrade_shape <- function(uri, new_shape, ctxxp) { invisible(.Call(`_tiledbsoma_tiledbsoma_upgrade_shape`, uri, new_shape, ctxxp)) } +c_update_dataframe_schema <- function(uri, ctxxp, column_names_to_drop, add_cols_types, add_cols_enum_value_types, add_cols_enum_ordered) { + invisible(.Call(`_tiledbsoma_c_update_dataframe_schema`, uri, ctxxp, column_names_to_drop, add_cols_types, add_cols_enum_value_types, add_cols_enum_ordered)) +} + #' Iterator-Style Access to SOMA Array via SOMAArray #' #' The `sr_*` functions provide low-level access to an instance of the SOMAArray diff --git a/apis/r/R/SOMADataFrame.R b/apis/r/R/SOMADataFrame.R index 0a8c669c99..b560023d20 100644 --- a/apis/r/R/SOMADataFrame.R +++ b/apis/r/R/SOMADataFrame.R @@ -238,6 +238,7 @@ SOMADataFrame <- R6::R6Class( #' prior to performing the update. The name of this new column will be set #' to the value specified by `row_index_name`. update = function(values, row_index_name = NULL) { + private$check_open_for_write() stopifnot( "'values' must be a data.frame, Arrow Table or RecordBatch" = @@ -299,47 +300,49 @@ SOMADataFrame <- R6::R6Class( new_schema[common_cols] ) - # Drop columns - se <- tiledb::tiledb_array_schema_evolution() - for (drop_col in drop_cols) { - spdl::debug("[SOMADataFrame update]: dropping column '{}'", drop_col) - se <- tiledb::tiledb_array_schema_evolution_drop_attribute( - object = se, - attrname = drop_col - ) - } + drop_cols_for_clib <- drop_cols + add_cols_types_for_clib <- + add_cols_enum_value_types_for_clib <- + add_cols_enum_ordered_for_clib <- vector("list", length = length(add_cols)) + names(add_cols_types_for_clib) <- + names(add_cols_enum_value_types_for_clib) <- + names(add_cols_enum_ordered_for_clib) <- add_cols # Add columns for (add_col in add_cols) { - spdl::debug("[SOMADataFrame update]: adding column '{}'", add_col) col_type <- new_schema$GetFieldByName(add_col)$type - attr <- tiledb_attr_from_arrow_field( - field = new_schema$GetFieldByName(add_col), - tiledb_create_options = tiledb_create_options - ) if (inherits(col_type, "DictionaryType")) { spdl::debug( - "[SOMADataFrame update]: adding column '{}' as an enumerated type", - add_col - ) - se <- tiledb::tiledb_array_schema_evolution_add_enumeration( - object = se, - name = add_col, - enums = levels(values$GetColumnByName(add_col)$as_vector()), - ordered = col_type$ordered + "[SOMADataFrame update]: adding enum column '{}' index type '{}' value type '{}' ordered {}", + add_col, col_type$index_type$name, col_type$value_type$name, col_type$ordered ) - attr <- tiledb::tiledb_attribute_set_enumeration_name(attr, add_col) - } - se <- tiledb::tiledb_array_schema_evolution_add_attribute(se, attr) + add_cols_types_for_clib[[add_col]] <- col_type$index_type$name + add_cols_enum_value_types_for_clib[[add_col]] <- col_type$value_type$name + add_cols_enum_ordered_for_clib[[add_col]] <- col_type$ordered + } else { + spdl::debug("[SOMADataFrame update]: adding column '{}' type '{}'", add_col, col_type$name) + + add_cols_types_for_clib[[add_col]] <- col_type$name + } } - se <- tiledb::tiledb_array_schema_evolution_array_evolve(se, self$uri) + if (length(drop_cols_for_clib) > 0 || length(add_cols_types_for_clib) > 0) { + c_update_dataframe_schema( + self$uri, + private$.soma_context, + drop_cols_for_clib, + Filter(Negate(is.null), add_cols_types_for_clib), + Filter(Negate(is.null), add_cols_enum_value_types_for_clib), + Filter(Negate(is.null), add_cols_enum_ordered_for_clib) + ) + } # Reopen array for writing with new schema self$reopen(mode = "WRITE") + spdl::debug("[SOMADataFrame update]: Writing new data") self$write(values) }, diff --git a/apis/r/man/SOMADataFrame.Rd b/apis/r/man/SOMADataFrame.Rd index a66f17affd..103d7e0361 100644 --- a/apis/r/man/SOMADataFrame.Rd +++ b/apis/r/man/SOMADataFrame.Rd @@ -24,6 +24,7 @@ row and is intended to act as a join key for other objects, such as \item \href{#method-SOMADataFrame-domain}{\code{SOMADataFrame$domain()}} \item \href{#method-SOMADataFrame-maxdomain}{\code{SOMADataFrame$maxdomain()}} \item \href{#method-SOMADataFrame-tiledbsoma_has_upgraded_domain}{\code{SOMADataFrame$tiledbsoma_has_upgraded_domain()}} +\item \href{#method-SOMADataFrame-resize_soma_joinid}{\code{SOMADataFrame$resize_soma_joinid()}} \item \href{#method-SOMADataFrame-clone}{\code{SOMADataFrame$clone()}} } } @@ -280,6 +281,34 @@ Logical } } \if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-SOMADataFrame-resize_soma_joinid}{}}} +\subsection{Method \code{resize_soma_joinid()}}{ +Increases the shape of the dataframe on the \code{soma_joinid} +index column, if it indeed is an index column, leaving all other index +columns as-is. If the \code{soma_joinid} is not an index column, no change is +made. This is a special case of \code{upgrade_domain} (WIP for 1.15), but +simpler to keystroke, and handles the most common case for dataframe +domain expansion. Raises an error if the dataframe doesn't already have a +domain: in that case please call \code{tiledbsoma_upgrade_domain} (WIP for +1.15). +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{SOMADataFrame$resize_soma_joinid(new_shape)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{new_shape}}{An integer, greater than or equal to 1 + the +\code{soma_joinid} domain slot.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +No return value +} +} +\if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-SOMADataFrame-clone}{}}} \subsection{Method \code{clone()}}{ diff --git a/apis/r/src/RcppExports.cpp b/apis/r/src/RcppExports.cpp index 832bdb4199..0e8859810e 100644 --- a/apis/r/src/RcppExports.cpp +++ b/apis/r/src/RcppExports.cpp @@ -512,6 +512,21 @@ BEGIN_RCPP return R_NilValue; END_RCPP } +// c_update_dataframe_schema +void c_update_dataframe_schema(const std::string& uri, Rcpp::XPtr ctxxp, Rcpp::CharacterVector column_names_to_drop, Rcpp::List add_cols_types, Rcpp::List add_cols_enum_value_types, Rcpp::List add_cols_enum_ordered); +RcppExport SEXP _tiledbsoma_c_update_dataframe_schema(SEXP uriSEXP, SEXP ctxxpSEXP, SEXP column_names_to_dropSEXP, SEXP add_cols_typesSEXP, SEXP add_cols_enum_value_typesSEXP, SEXP add_cols_enum_orderedSEXP) { +BEGIN_RCPP + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const std::string& >::type uri(uriSEXP); + Rcpp::traits::input_parameter< Rcpp::XPtr >::type ctxxp(ctxxpSEXP); + Rcpp::traits::input_parameter< Rcpp::CharacterVector >::type column_names_to_drop(column_names_to_dropSEXP); + Rcpp::traits::input_parameter< Rcpp::List >::type add_cols_types(add_cols_typesSEXP); + Rcpp::traits::input_parameter< Rcpp::List >::type add_cols_enum_value_types(add_cols_enum_value_typesSEXP); + Rcpp::traits::input_parameter< Rcpp::List >::type add_cols_enum_ordered(add_cols_enum_orderedSEXP); + c_update_dataframe_schema(uri, ctxxp, column_names_to_drop, add_cols_types, add_cols_enum_value_types, add_cols_enum_ordered); + return R_NilValue; +END_RCPP +} // sr_setup Rcpp::XPtr sr_setup(const std::string& uri, Rcpp::XPtr ctxxp, Rcpp::Nullable colnames, Rcpp::Nullable> qc, Rcpp::Nullable dim_points, Rcpp::Nullable dim_ranges, std::string batch_size, std::string result_order, Rcpp::Nullable timestamprange, const std::string& loglevel); RcppExport SEXP _tiledbsoma_sr_setup(SEXP uriSEXP, SEXP ctxxpSEXP, SEXP colnamesSEXP, SEXP qcSEXP, SEXP dim_pointsSEXP, SEXP dim_rangesSEXP, SEXP batch_sizeSEXP, SEXP result_orderSEXP, SEXP timestamprangeSEXP, SEXP loglevelSEXP) { @@ -722,6 +737,7 @@ static const R_CallMethodDef CallEntries[] = { {"_tiledbsoma_resize", (DL_FUNC) &_tiledbsoma_resize, 3}, {"_tiledbsoma_resize_soma_joinid", (DL_FUNC) &_tiledbsoma_resize_soma_joinid, 3}, {"_tiledbsoma_tiledbsoma_upgrade_shape", (DL_FUNC) &_tiledbsoma_tiledbsoma_upgrade_shape, 3}, + {"_tiledbsoma_c_update_dataframe_schema", (DL_FUNC) &_tiledbsoma_c_update_dataframe_schema, 6}, {"_tiledbsoma_sr_setup", (DL_FUNC) &_tiledbsoma_sr_setup, 10}, {"_tiledbsoma_sr_complete", (DL_FUNC) &_tiledbsoma_sr_complete, 1}, {"_tiledbsoma_create_empty_arrow_table", (DL_FUNC) &_tiledbsoma_create_empty_arrow_table, 0}, diff --git a/apis/r/src/rinterface.cpp b/apis/r/src/rinterface.cpp index 2386db3324..a1e84ccc11 100644 --- a/apis/r/src/rinterface.cpp +++ b/apis/r/src/rinterface.cpp @@ -415,3 +415,84 @@ void tiledbsoma_upgrade_shape( sr->upgrade_shape(new_shape_i64); sr->close(); } + +// [[Rcpp::export]] +void c_update_dataframe_schema( + const std::string& uri, + Rcpp::XPtr ctxxp, + Rcpp::CharacterVector column_names_to_drop, + Rcpp::List add_cols_types, + Rcpp::List add_cols_enum_value_types, + Rcpp::List add_cols_enum_ordered) { + // Drop columns is just a list of column names: it goes right through + // from R to C++. + std::vector drop_attrs = Rcpp::as>( + column_names_to_drop); + + // For add columns: coming from R we have a named list from attr name to: + // * for non-enum attrs: the datatype of the attr + // * for enum attrs: the index type (e.g. int8) of the enumeration attr + std::map add_attrs; + int n_add = add_cols_types.length(); + + if (n_add > 0) { + // Calling .names on empty list results in: + // Not compatible with STRSXP: [type=NULL].Abort trap: 6 + Rcpp::CharacterVector add_col_names = add_cols_types.names(); + for (int i = 0; i < n_add; i++) { + std::string type_name = Rcpp::as(add_cols_types[i]); + + // Map type names like "int8" in the R Arrow API to type names like + // "c" in the C NanoArrow API. I looked and didn't find an R + // accessor for this; no big deal. Here we remap what we know about, + // and if there's still an unrecognized type name (which is + // developer error, not user error), we will let libtiledbsoma + // throw. + type_name = remap_arrow_type_code_r_to_c(type_name); + + add_attrs.emplace(add_col_names[i], type_name); + } + } + + // For enum columns, two more things: value type (e.g. string) and + // is-ordered-enum boolean. These come into us as separate lists but + // we will reshape them into a map from enum-attr name to pair of + // (value_type, ordered). + + // First do integrity checks. + if (add_cols_enum_value_types.length() != add_cols_enum_ordered.length()) { + // This isn't user error + throw Rcpp::exception( + "c_update_dataframe_schema: internal coding error"); + } + + std::map> add_enmrs; + int n_add_enum = add_cols_enum_value_types.length(); + if (n_add_enum > 0) { + // Calling .names on empty list results in: + // Not compatible with STRSXP: [type=NULL].Abort trap: 6 + Rcpp::CharacterVector add_enum_col_names = add_cols_enum_value_types + .names(); + Rcpp::CharacterVector other_names = add_cols_enum_ordered.names(); + for (int i = 0; i < n_add_enum; i++) { + if (add_enum_col_names[i] != other_names[i]) { + // This also isn't user error + throw Rcpp::exception( + "c_update_dataframe_schema: internal coding error"); + } + } + + for (int i = 0; i < n_add_enum; i++) { + std::string key = Rcpp::as(add_enum_col_names[i]); + std::string type_name = Rcpp::as( + add_cols_enum_value_types[i]); + type_name = remap_arrow_type_code_r_to_c(type_name); + bool ordered = Rcpp::as(add_cols_enum_ordered[i]); + + add_enmrs.emplace(key, std::pair(type_name, ordered)); + } + } + + tdbs::SOMADataFrame::update_dataframe_schema( + uri, ctxxp->ctxptr, drop_attrs, add_attrs, add_enmrs); +} diff --git a/apis/r/src/rutilities.cpp b/apis/r/src/rutilities.cpp index 913d00b6a7..78b848923b 100644 --- a/apis/r/src/rutilities.cpp +++ b/apis/r/src/rutilities.cpp @@ -501,3 +501,27 @@ SEXP convert_domainish(const tdbs::ArrowTable& arrow_table) { return arrayxp; } + +static std::map _type_name_remap = { + {"int8", "c"}, + {"int16", "s"}, + {"int32", "i"}, + {"int64", "l"}, + {"uint8", "C"}, + {"uint16", "S"}, + {"uint32", "I"}, + {"uint64", "L"}, + {"utf8", "u"}, + {"large_utf8", "U"}, + {"bool", "b"}, + {"float", "f"}, + {"double", "g"}}; + +std::string remap_arrow_type_code_r_to_c(std::string input) { + auto it = _type_name_remap.find(input); + if (it == _type_name_remap.end()) { + return input; + } else { + return it->second; + } +} diff --git a/apis/r/src/rutilities.h b/apis/r/src/rutilities.h index 03990c08ef..836a205764 100644 --- a/apis/r/src/rutilities.h +++ b/apis/r/src/rutilities.h @@ -76,3 +76,6 @@ std::vector i64_from_rcpp_numeric(const Rcpp::NumericVector& input); // * obtain an ArrowTable // * need to map that to an R list of lo/hi pairs SEXP convert_domainish(const tdbs::ArrowTable& arrow_table); + +// Maps e.g. "int8" and "float32" to "c" and "f". +std::string remap_arrow_type_code_r_to_c(std::string input);