Refactor get_metadata() to use arbitrary parquet files
Conflicts:
	R/metadata.R
multimeric committed Feb 22, 2024
2 parents fd2009a + 78b8c0f commit 7efdf77
Showing 12 changed files with 79 additions and 50 deletions.
4 changes: 3 additions & 1 deletion NAMESPACE
@@ -1,7 +1,7 @@
 # Generated by roxygen2: do not edit by hand

 S3method(as.sparse,DelayedMatrix)
-export(DATABASE_URL)
+export(get_database_url)
 export(SAMPLE_DATABASE_URL)
 export(get_SingleCellExperiment)
 export(get_metadata)
@@ -43,6 +43,7 @@ importFrom(dplyr,tbl)
 importFrom(dplyr,transmute)
 importFrom(duckdb,duckdb)
 importFrom(glue,glue)
+importFrom(glue,glue_sql)
 importFrom(httr,GET)
 importFrom(httr,HEAD)
 importFrom(httr,modify_url)
@@ -61,6 +62,7 @@ importFrom(purrr,pmap_chr)
 importFrom(purrr,reduce)
 importFrom(purrr,set_names)
 importFrom(purrr,walk)
+importFrom(purrr,walk2)
 importFrom(rlang,.data)
 importFrom(stats,setNames)
 importFrom(stringr,str_remove_all)
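NAMESPACE is regenerated by roxygen2 (e.g. via `devtools::document()`), so each changed directive corresponds one-to-one to a roxygen tag in the R sources; a sketch of the mapping for the entries above:

```r
# roxygen tag in an R source file     NAMESPACE directive it generates
#' @export                          # export(get_database_url)  (above the function)
#' @importFrom glue glue_sql        # importFrom(glue,glue_sql)
#' @importFrom purrr walk2          # importFrom(purrr,walk2)
```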
2 changes: 1 addition & 1 deletion R/counts.R
@@ -112,7 +112,7 @@ get_single_cell_experiment <- function(
     has_name(raw_data, c("cell_", "file_id_db"))
 )

-    versioned_cache_directory <- file.path(cache_directory, COUNTS_VERSION)
+    versioned_cache_directory <- cache_directory
     versioned_cache_directory |> dir.create(
         showWarnings = FALSE,
         recursive = TRUE
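Paired with the `get_default_cache_dir()` change in R/utils.R further down, this moves the version segment from the call site into the default cache path itself, so the resolved location is unchanged; a sketch, assuming `COUNTS_VERSION` is "0.2.1" as the updated test expects:

```r
# Before: the version was appended inside get_single_cell_experiment()
#   file.path(cache_directory, COUNTS_VERSION)
#   e.g. "~/.cache/R/CuratedAtlasQueryR" + "0.2.1"
# After: get_default_cache_dir() already ends in the version,
#   so the caller's cache_directory is used verbatim
#   e.g. "~/.cache/R/CuratedAtlasQueryR/0.2.1"
```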
4 changes: 2 additions & 2 deletions R/dev.R
@@ -135,7 +135,7 @@ update_unharmonised <- function(unharmonised_parquet_dir, ...){
 #' @keywords internal
 #' @return A character vector of the newly-created anndata files
 #' @examples
-#' \donttest{
+#' \dontrun{
 #' hdf5_to_anndata(
 #'     "/vast/projects/cellxgene_curated/splitted_DB2_data_0.2.1",
 #'     "/vast/projects/cellxgene_curated/splitted_DB2_anndata_0.2.1"
@@ -194,7 +194,7 @@ hdf5_to_anndata <- function(input_directory, output_directory){
 # @return A character vector of the newly-created anndata files
 # @noRd
 # @examples
-# \donttest{
+# \dontrun{
 # h5seurat_to_anndata(
 #     "/vast/projects/cellxgene_curated/splitted_DB2_data_0.2.1",
 #     "/vast/projects/cellxgene_curated/splitted_DB2_anndata_0.2.1"
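For context on the `\donttest` to `\dontrun` switch: both tags keep an example out of interactive `example()` runs, but `\donttest` examples can still be executed by `R CMD check` (for instance under `--as-cran`), while `\dontrun` examples are never executed, which suits examples that depend on HPC-only paths like `/vast/projects`. A minimal roxygen sketch:

```r
#' @examples
#' \dontrun{
#' # Never run by R CMD check; only displayed in the rendered help page
#' hdf5_to_anndata(input_directory, output_directory)
#' }
```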
75 changes: 43 additions & 32 deletions R/metadata.R
@@ -9,16 +9,20 @@ cache <- rlang::env(
     metadata_table = rlang::env()
 )

-#' URL pointing to the full metadata file
+#' Returns the URLs for all metadata files
 #' @export
-#' @return A character scalar consisting of the URL
+#' @return A named character vector whose names are parquet file names, and whose values are URLs
 #' @examples
-#' get_metadata(remote_url = DATABASE_URL)
-DATABASE_URL <- single_line_str(
-    "https://object-store.rc.nectar.org.au/v1/
-    AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/metadata.0.2.3.parquet"
-)
+#' get_metadata(remote_url = get_database_url("metadata.0.2.3.parquet"))
+get_database_url <- function(databases = c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet")) {
+    glue::glue(
+        "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/{databases}"
+    )
+}


 #' URL pointing to the sample metadata file, which is smaller and for test,
 #' demonstration, and vignette purposes only
 #' @export
@@ -38,8 +42,8 @@ SAMPLE_DATABASE_URL <- single_line_str(
 #' into [get_single_cell_experiment()] to obtain a
 #' [`SingleCellExperiment::SingleCellExperiment-class`]
 #'
-#' @param remote_url Optional character vector of length 1. An HTTP URL pointing
-#'   to the location of the parquet database.
+#' @param remote_url Optional character vector of any length. One or more HTTP
+#'   URLs, each pointing to the location of a parquet database file.
 #' @param cache_directory Optional character vector of length 1. A file path on
 #'   your local system to a directory (not a file) that will be used to store
 #'   `metadata.parquet`
@@ -68,6 +72,7 @@ SAMPLE_DATABASE_URL <- single_line_str(
 #' @importFrom httr progress
 #' @importFrom cli cli_alert_info hash_sha256
 #' @importFrom glue glue
+#' @importFrom purrr walk
 #'
 #' @details
 #'
@@ -139,34 +144,40 @@ SAMPLE_DATABASE_URL <- single_line_str(
 #' The solution is to choose a different cache, for example
 #' ```R
 #' get_metadata(cache_directory = path.expand('~'))
 #' ```
 get_metadata <- function(
-    remote_url = DATABASE_URL,
+    remote_url = get_database_url(),
     cache_directory = get_default_cache_dir(),
     use_cache = TRUE
 ) {
-    hash <- c(remote_url, cache_directory) |>
-        paste0(collapse = "") |>
-        hash_sha256()
-    cached_connection <- cache$metadata_table[[hash]]
-    if (!is.null(cached_connection) && isTRUE(use_cache)) {
-        cached_connection
-    }
-    else {
-        db_path <- file.path(cache_directory, "metadata.0.2.3.parquet")
-
-        if (!file.exists(db_path)) {
-            report_file_sizes(remote_url)
-            sync_remote_file(
-                remote_url,
-                db_path,
-                progress(type = "down", con = stderr())
-            )
-        }
+    # Download remote files if needed
+    walk(remote_url, function(url) {
+        path <- file.path(cache_directory, url |> basename())
+        if (!file.exists(path)) {
+            report_file_sizes(url)
+            sync_remote_file(
+                url,
+                path,
+                progress(type = "down", con = stderr())
+            )
+        }
+    })
+
+    all_metadata <- list.files(cache_directory, pattern = "*.parquet")
+    hash <- all_metadata |>
+        paste0(collapse = "") |>
+        hash_sha256()

-        table <- duckdb() |>
-            dbConnect(drv = _, read_only = TRUE) |>
-            read_parquet(db_path)
-        cache$metadata_table[[hash]] <- table
-        table
+    cached_connection <- cache$metadata_table[[hash]]
+    if (!is.null(cached_connection) && isTRUE(use_cache)) {
+        # Reuse the hashed copy of the table if no parquet files have changed
+        cached_connection
+    }
+    else {
+        # Load all parquet files as one lazy data frame
+        table <- duckdb() |>
+            dbConnect(drv = _, read_only = TRUE) |>
+            read_parquet(file.path(cache_directory, all_metadata))
+        cache$metadata_table[[hash]] <- table
+        table
     }
 }

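Taken together, a usage sketch of the refactored interface (based only on the exports in this diff; the filter value 'b' comes from the new test further down):

```r
library(CuratedAtlasQueryR)
library(dplyr)

# URLs of the bundled databases (metadata + fibrosis by default)
get_database_url()

# Query a single parquet file...
meta <- get_metadata(remote_url = get_database_url("metadata.0.2.3.parquet"))

# ...or download the defaults and query the union of every parquet
# file in the cache directory as one lazy tbl
all_meta <- get_metadata()
all_meta |>
    filter(cell_type_harmonised == "b") |>
    head()
```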
3 changes: 1 addition & 2 deletions R/unharmonised.R
@@ -37,7 +37,7 @@ UNHARMONISED_URL <- single_line_str(
 #' @return A named list, where each name is a dataset file ID, and each value is
 #'   a "lazy data frame", ie a `tbl`.
 #' @examples
-#' \donttest{
+#' \dontrun{
 #' dataset <- "838ea006-2369-4e2c-b426-b2a744a2b02b"
 #' harmonised_meta <- get_metadata() |>
 #'     dplyr::filter(file_id == dataset) |> dplyr::collect()
@@ -54,7 +54,6 @@ get_unharmonised_dataset <- function(
 ){
     unharmonised_root <- file.path(
         cache_directory,
-        COUNTS_VERSION,
         "unharmonised"
     )
     file_name <- glue::glue("{dataset_id}.parquet")
6 changes: 4 additions & 2 deletions R/utils.R
@@ -41,7 +41,7 @@ single_line_str <- function(text){
     str_remove_all(text, r"(\n\s*)")
 }

-#' Returns the default cache directory
+#' Returns the default cache directory with a version number
 #' @return A length one character vector.
 #' @importFrom tools R_user_dir
 #' @importFrom utils packageName
@@ -51,6 +51,7 @@ get_default_cache_dir <- function() {
     R_user_dir(
         "cache"
     ) |>
+        file.path(COUNTS_VERSION) |>
         normalizePath() |>
         suppressWarnings()
 }
@@ -89,10 +90,11 @@ sync_remote_file <- function(full_url, output_file, ...) {
 #' @importFrom glue glue
 #' @importFrom dplyr tbl
 #' @importFrom dbplyr sql
+#' @importFrom glue glue_sql
 #' @return An SQL data frame
 #' @keywords internal
 read_parquet <- function(conn, path){
-    from_clause <- glue("FROM read_parquet('{path}')") |> sql()
+    from_clause <- glue_sql("FROM read_parquet([{`path`*}], union_by_name=true)", .con = conn) |> sql()
     tbl(conn, from_clause)
 }

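The rewritten `read_parquet()` asks DuckDB to scan a list of files rather than a single one; roughly equivalent standalone code (a sketch with hypothetical local file paths, not the package's exact implementation):

```r
library(duckdb)
library(DBI)
library(dplyr)
library(dbplyr)

con <- dbConnect(duckdb(), read_only = TRUE)

# Hypothetical local copies of the two bundled databases
paths <- c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet")

# Quote each path as a SQL string literal; union_by_name=true aligns
# columns across files by name, filling missing columns with NULL
file_list <- paste0("'", paths, "'", collapse = ", ")
from_clause <- sql(paste0(
    "FROM read_parquet([", file_list, "], union_by_name=true)"
))

# One lazy tbl over the union of all parquet files
metadata <- tbl(con, from_clause)
```

The list form plus `union_by_name` is what lets a later database (such as the fibrosis file) add columns the original metadata file lacks without breaking existing queries.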
8 changes: 4 additions & 4 deletions man/DATABASE_URL.Rd
2 changes: 1 addition & 1 deletion man/get_metadata.Rd
2 changes: 1 addition & 1 deletion man/get_unharmonised_dataset.Rd
2 changes: 1 addition & 1 deletion man/hdf5_to_anndata.Rd

Some generated files are not rendered by default.
19 changes: 17 additions & 2 deletions tests/testthat/test-query.R
@@ -19,7 +19,7 @@ test_that("get_default_cache_dir() returns the correct directory on Linux", {
     grepl("linux", version$platform, fixed = TRUE) |>
         skip_if_not()

-    "~/.cache/R/CuratedAtlasQueryR" |>
+    "~/.cache/R/CuratedAtlasQueryR/0.2.1" |>
         normalizePath() |>
         expect_equal(
             get_default_cache_dir(),
@@ -131,7 +131,7 @@ test_that("get_SingleCellExperiment() assigns the right cell ID to each cell", {

# Load the SCE from cache directly
assay_1 = CuratedAtlasQueryR:::get_default_cache_dir() |>
file.path(CuratedAtlasQueryR:::COUNTS_VERSION, "original", id) |>
file.path("original", id) |>
HDF5Array::loadHDF5SummarizedExperiment() |>
assay("X") |>
as.matrix()
@@ -190,3 +190,18 @@ test_that("get_metadata() is cached", {

     identical(table, table_2) |> expect_true()
 })
+
+test_that("get_database_url() returns a character vector", {
+    get_database_url() |>
+        expect_s3_class("character")
+})
+
+test_that("get_metadata() includes cell_type 'b', which comes from the fibrosis database", {
+    n_cell <- get_metadata() |>
+        filter(cell_type_harmonised == 'b') |>
+        as_tibble() |>
+        nrow()
+    expect_true(n_cell > 0)
+})
2 changes: 1 addition & 1 deletion vignettes/Introduction.Rmd
@@ -51,7 +51,7 @@ find_figure <- function(names){
 }
 METADATA_URL = if (params$demo_metadata)
     CuratedAtlasQueryR::SAMPLE_DATABASE_URL else
-    CuratedAtlasQueryR::DATABASE_URL
+    CuratedAtlasQueryR::get_database_url("metadata.0.2.3.parquet")
 ```

 <!-- badges: start -->
