From c50cb98fe3dc8e5142c253c02556c2183a72a88a Mon Sep 17 00:00:00 2001 From: John Kerl Date: Thu, 3 Oct 2024 17:36:03 -0400 Subject: [PATCH] [r] Push `schema` accessor down to `libtiledbsoma` (#3079) * [r] Push `schema` accessor down to `libtiledbsoma` * debug [skip ci] * unit-testing * close some dangling-open unit-test handles * fix unit-test failures * code-review feedback Co-authored-by: Paul Hoffman * DESCRIPTION and NEWS.md [skip ci] * one more spot * rebase --------- Co-authored-by: Paul Hoffman --- apis/r/DESCRIPTION | 2 +- apis/r/NEWS.md | 1 + apis/r/R/RcppExports.R | 4 ++ apis/r/R/SOMADataFrame.R | 5 ++ apis/r/R/SOMADenseNDArray.R | 2 +- apis/r/R/TileDBArray.R | 3 +- apis/r/src/RcppExports.cpp | 13 +++++ apis/r/src/rinterface.cpp | 28 ++++++++++ apis/r/tests/testthat/test-SOMADataFrame.R | 59 ++++++++++++++++------ 9 files changed, 98 insertions(+), 19 deletions(-) diff --git a/apis/r/DESCRIPTION b/apis/r/DESCRIPTION index 24580ba4ee..c00b1237d3 100644 --- a/apis/r/DESCRIPTION +++ b/apis/r/DESCRIPTION @@ -6,7 +6,7 @@ Description: Interface for working with 'TileDB'-based Stack of Matrices, like those commonly used for single cell data analysis. It is documented at ; a formal specification available is at . -Version: 1.14.99.4 +Version: 1.14.99.5 Authors@R: c( person(given = "Aaron", family = "Wolen", role = c("cre", "aut"), email = "aaron@tiledb.com", diff --git a/apis/r/NEWS.md b/apis/r/NEWS.md index 880158f287..391bec9118 100644 --- a/apis/r/NEWS.md +++ b/apis/r/NEWS.md @@ -2,6 +2,7 @@ ## Changes +* Push `schema` accessor down to `libtiledbsoma` [#3079](https://github.com/single-cell-data/TileDB-SOMA/pull/3079) * Push `attrnames` down to C++ [#3121](https://github.com/single-cell-data/TileDB-SOMA/pull/3121) * Use `libtiledbsoma` for R schema evolution [#3100](https://github.com/single-cell-data/TileDB-SOMA/pull/3100) * Implement missing `domain` argument to `SOMADataFrame` `create` [#3032](https://github.com/single-cell-data/TileDB-SOMA/pull/3032) diff --git a/apis/r/R/RcppExports.R b/apis/r/R/RcppExports.R index 7fe35618d3..9c672052f5 100644 --- a/apis/r/R/RcppExports.R +++ b/apis/r/R/RcppExports.R @@ -218,6 +218,10 @@ c_attrnames <- function(uri, ctxxp) { .Call(`_tiledbsoma_c_attrnames`, uri, ctxxp) } +c_schema <- function(uri, ctxxp) { + .Call(`_tiledbsoma_c_schema`, uri, ctxxp) +} + resize <- function(uri, new_shape, ctxxp) { invisible(.Call(`_tiledbsoma_resize`, uri, new_shape, ctxxp)) } diff --git a/apis/r/R/SOMADataFrame.R b/apis/r/R/SOMADataFrame.R index b560023d20..abaae46828 100644 --- a/apis/r/R/SOMADataFrame.R +++ b/apis/r/R/SOMADataFrame.R @@ -245,6 +245,11 @@ SOMADataFrame <- R6::R6Class( is.data.frame(values) || is_arrow_table(values) || is_arrow_record_batch(values) ) + # Leave state unmodified + # TODO: this issue will automatically go away on https://github.com/single-cell-data/TileDB-SOMA/issues/3059 + omode <- self$mode() + on.exit(self$reopen(mode = omode)) + if (is.data.frame(values)) { if (!is.null(row_index_name)) { stopifnot( diff --git a/apis/r/R/SOMADenseNDArray.R b/apis/r/R/SOMADenseNDArray.R index 6cf8cb31c3..f6e12f34a3 100644 --- a/apis/r/R/SOMADenseNDArray.R +++ b/apis/r/R/SOMADenseNDArray.R @@ -128,7 +128,7 @@ SOMADenseNDArray <- R6::R6Class( "'coords' must be a list of integer vectors" = is.list(coords) && all(vapply_lgl(coords, is.integer)), "length of 'coords' must match number of dimensions" = - length(coords) == length(self$dimensions()) + length(coords) == self$ndim() ) ## the 'soma_data' data type may not have been cached, and if so we need to fetch it diff --git a/apis/r/R/TileDBArray.R b/apis/r/R/TileDBArray.R index 4b9e25f96e..2d4ec6aa6c 100644 --- a/apis/r/R/TileDBArray.R +++ b/apis/r/R/TileDBArray.R @@ -132,7 +132,8 @@ TileDBArray <- R6::R6Class( #' @description Retrieve the array schema as an Arrow schema (lifecycle: maturing) #' @return A [`arrow::schema`] object schema = function() { - arrow_schema_from_tiledb_schema(tiledb::schema(self$object)) + arrow::as_schema( + c_schema(self$uri, private$.soma_context)); }, #' @description Retrieve the array schema as TileDB schema (lifecycle: maturing) diff --git a/apis/r/src/RcppExports.cpp b/apis/r/src/RcppExports.cpp index 11ff65a529..bc936a54a9 100644 --- a/apis/r/src/RcppExports.cpp +++ b/apis/r/src/RcppExports.cpp @@ -488,6 +488,18 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// c_schema +SEXP c_schema(const std::string& uri, Rcpp::XPtr ctxxp); +RcppExport SEXP _tiledbsoma_c_schema(SEXP uriSEXP, SEXP ctxxpSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const std::string& >::type uri(uriSEXP); + Rcpp::traits::input_parameter< Rcpp::XPtr >::type ctxxp(ctxxpSEXP); + rcpp_result_gen = Rcpp::wrap(c_schema(uri, ctxxp)); + return rcpp_result_gen; +END_RCPP +} // resize void resize(const std::string& uri, Rcpp::NumericVector new_shape, Rcpp::XPtr ctxxp); RcppExport SEXP _tiledbsoma_resize(SEXP uriSEXP, SEXP new_shapeSEXP, SEXP ctxxpSEXP) { @@ -747,6 +759,7 @@ static const R_CallMethodDef CallEntries[] = { {"_tiledbsoma_ndim", (DL_FUNC) &_tiledbsoma_ndim, 2}, {"_tiledbsoma_c_dimnames", (DL_FUNC) &_tiledbsoma_c_dimnames, 2}, {"_tiledbsoma_c_attrnames", (DL_FUNC) &_tiledbsoma_c_attrnames, 2}, + {"_tiledbsoma_c_schema", (DL_FUNC) &_tiledbsoma_c_schema, 2}, {"_tiledbsoma_resize", (DL_FUNC) &_tiledbsoma_resize, 3}, {"_tiledbsoma_resize_soma_joinid", (DL_FUNC) &_tiledbsoma_resize_soma_joinid, 3}, {"_tiledbsoma_tiledbsoma_upgrade_shape", (DL_FUNC) &_tiledbsoma_tiledbsoma_upgrade_shape, 3}, diff --git a/apis/r/src/rinterface.cpp b/apis/r/src/rinterface.cpp index 8176fa1230..42f1013642 100644 --- a/apis/r/src/rinterface.cpp +++ b/apis/r/src/rinterface.cpp @@ -389,6 +389,34 @@ Rcpp::CharacterVector c_attrnames( return retval; } +// [[Rcpp::export]] +SEXP c_schema(const std::string& uri, Rcpp::XPtr ctxxp) { + auto sr = tdbs::SOMAArray::open(OpenMode::read, uri, ctxxp->ctxptr); + std::unique_ptr lib_retval = sr->arrow_schema(); + sr->close(); + + auto schemaxp = nanoarrow_schema_owning_xptr(); + auto sch = nanoarrow_output_schema_from_xptr(schemaxp); + exitIfError( + ArrowSchemaInitFromType(sch, NANOARROW_TYPE_STRUCT), "Bad schema init"); + exitIfError(ArrowSchemaSetName(sch, ""), "Bad schema name"); + exitIfError( + ArrowSchemaAllocateChildren(sch, lib_retval->n_children), + "Bad schema children alloc"); + + for (size_t i = 0; i < lib_retval->n_children; i++) { + spdl::info( + "[c_schema] Accessing name '{}' format '{}' at position {}", + std::string(lib_retval->children[i]->name), + std::string(lib_retval->children[i]->format), + i); + + ArrowSchemaMove(lib_retval->children[i], sch->children[i]); + } + + return schemaxp; +} + // [[Rcpp::export]] void resize( const std::string& uri, diff --git a/apis/r/tests/testthat/test-SOMADataFrame.R b/apis/r/tests/testthat/test-SOMADataFrame.R index c5974878e9..64a8334acd 100644 --- a/apis/r/tests/testthat/test-SOMADataFrame.R +++ b/apis/r/tests/testthat/test-SOMADataFrame.R @@ -617,33 +617,47 @@ test_that("SOMADataFrame can be updated", { uri <- withr::local_tempdir("soma-dataframe-update") if (dir.exists(uri)) unlink(uri, recursive=TRUE) sdf <- create_and_populate_soma_dataframe(uri, nrows = 10L) + sdf$close(); # Retrieve the table from disk - tbl0 <- SOMADataFrameOpen(uri, "READ")$read()$concat() + sdf <- SOMADataFrameOpen(uri, "READ") + tbl0 <- sdf$read()$concat() + sdf$close() # Remove a column and update tbl0$float_column <- NULL - sdf <- SOMADataFrameOpen(uri, "WRITE")$update(tbl0) + sdf <- SOMADataFrameOpen(uri, "WRITE") + sdf$update(tbl0) + sdf$close() # Verify attribute was removed on disk - tbl1 <- SOMADataFrameOpen(uri, "READ")$read()$concat() + sdf <- SOMADataFrameOpen(uri, "READ") + tbl1 <- sdf$read()$concat() expect_true(tbl1$Equals(tbl0)) + sdf$close() # # Add a new column and update tbl0$float_column <- sample(c(TRUE, FALSE), nrow(tbl0), replace = TRUE) - sdf <- SOMADataFrameOpen(uri, mode = "WRITE")$update(tbl0) + sdf <- SOMADataFrameOpen(uri, "WRITE") + sdf$update(tbl0) + sdf$close() # Verify attribute was added on disk - tbl1 <- SOMADataFrameOpen(uri, mode = "READ")$read()$concat() + sdf <- SOMADataFrameOpen(uri, "READ") + tbl1 <- sdf$read()$concat() expect_true(tbl1$Equals(tbl0)) + sdf$close() # Add a new enum and update tbl0$frobo <- factor(sample(letters[1:3], nrow(tbl0), replace = TRUE)) - expect_no_condition(sdf <- SOMADataFrameOpen(uri, mode = "WRITE")$update(tbl0)) + sdf <- SOMADataFrameOpen(uri, "WRITE") + expect_no_condition(sdf$update(tbl0)) + sdf$close() # Verify enum was added on disk + sdf <- SOMADataFrameOpen(uri, "READ") expect_s3_class( - tbl1 <- SOMADataFrameOpen(uri, mode = "READ")$read()$concat(), + tbl1 <- sdf$read()$concat(), "Table" ) expect_identical(as.data.frame(tbl1), as.data.frame(tbl0)) @@ -652,6 +666,7 @@ test_that("SOMADataFrame can be updated", { "factor", exact = TRUE ) + sdf$close() # Add a new enum where levels aren't in appearance- or alphabetical-order tbl0 <- tbl1 @@ -663,11 +678,14 @@ test_that("SOMADataFrame can be updated", { levels(tbl0$GetColumnByName("rlvl")$as_vector()), c("green", "red", "blue") ) - expect_no_condition(sdf <- SOMADataFrameOpen(uri, mode = "WRITE")$update(tbl0)) + sdf <- SOMADataFrameOpen(uri, "WRITE") + expect_no_condition(sdf$update(tbl0)) + sdf$close() # Verify unordered enum was added on disk + sdf <- SOMADataFrameOpen(uri, "READ") expect_s3_class( - tbl1 <- SOMADataFrameOpen(uri, mode = "READ")$read()$concat(), + tbl1 <- sdf$read()$concat(), "Table" ) expect_identical(as.data.frame(tbl1), as.data.frame(tbl0)) @@ -684,26 +702,31 @@ test_that("SOMADataFrame can be updated", { # Verify queryability expect_s3_class( - tblq <- SOMADataFrameOpen(uri, mode = "READ")$read(value_filter = 'rlvl == "green"')$concat(), + tblq <- sdf$read(value_filter = 'rlvl == "green"')$concat(), "Table" ) expect_length(tblq[["rlvl"]], 3) expect_s3_class( - tblq <- SOMADataFrameOpen(uri, mode = "READ")$read(value_filter = 'rlvl %in% c("blue", "green")')$concat(), + tblq <- sdf$read(value_filter = 'rlvl %in% c("blue", "green")')$concat(), "Table" ) expect_length(tblq[["rlvl"]], 6) + sdf$close() # Add a new ordered and update tbl0 <- tbl1 tbl0$ord <- ordered(sample(c("g1", "g2", "g3"), nrow(tbl0), replace = TRUE)) - expect_no_condition(sdf <- SOMADataFrameOpen(uri, mode = "WRITE")$update(tbl0)) + sdf <- SOMADataFrameOpen(uri, "WRITE") + expect_no_condition(sdf$update(tbl0)) + sdf$close() # Verify ordered was added on disk + sdf <- SOMADataFrameOpen(uri, "READ") expect_s3_class( - tbl1 <- SOMADataFrameOpen(uri, mode = "READ")$read()$concat(), + tbl1 <- sdf$read()$concat(), "Table" ) + sdf$close() # Read ordered enums expect_identical(as.data.frame(tbl1), as.data.frame(tbl0)) @@ -715,16 +738,18 @@ test_that("SOMADataFrame can be updated", { # Error if attempting to drop an array dimension tbl0$int_column <- NULL # drop the indexed dimension + sdf <- SOMADataFrameOpen(uri, "WRITE") expect_error( - SOMADataFrameOpen(uri, mode = "WRITE")$update(tbl0), + sdf$update(tbl0), "The following indexed field does not exist" ) tbl0 <- tbl1 + # Error on incompatible schema updates tbl0$string_column <- tbl0$string_column$cast(target_type = arrow::int32()) # string to int expect_error( - SOMADataFrameOpen(uri, mode = "WRITE")$update(tbl0), + sdf$update(tbl0), "Schemas are incompatible" ) tbl0 <- tbl1 @@ -732,9 +757,11 @@ test_that("SOMADataFrame can be updated", { # Error if the number of rows changes tbl0 <- tbl0$Slice(offset = 1, length = tbl0$num_rows - 1) expect_error( - SOMADataFrameOpen(uri, mode = "WRITE")$update(tbl0), + sdf$update(tbl0), "Number of rows in 'values' must match number of rows in array" ) + + sdf$close() }) test_that("SOMADataFrame can be updated from a data frame", {