diff --git a/NAMESPACE b/NAMESPACE
index eef68d0..b453e4a 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,7 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
 S3method(as.sparse,DelayedMatrix)
-export(DATABASE_URL)
+export(get_database_url)
 export(SAMPLE_DATABASE_URL)
 export(get_SingleCellExperiment)
 export(get_metadata)
@@ -43,6 +43,7 @@ importFrom(dplyr,tbl)
 importFrom(dplyr,transmute)
 importFrom(duckdb,duckdb)
 importFrom(glue,glue)
+importFrom(glue,glue_sql)
 importFrom(httr,GET)
 importFrom(httr,HEAD)
 importFrom(httr,modify_url)
@@ -61,6 +62,7 @@ importFrom(purrr,pmap_chr)
 importFrom(purrr,reduce)
 importFrom(purrr,set_names)
 importFrom(purrr,walk)
+importFrom(purrr,walk2)
 importFrom(rlang,.data)
 importFrom(stats,setNames)
 importFrom(stringr,str_remove_all)
diff --git a/R/counts.R b/R/counts.R
index f8c9a93..8e06663 100644
--- a/R/counts.R
+++ b/R/counts.R
@@ -112,7 +112,7 @@ get_single_cell_experiment <- function(
         has_name(raw_data, c("cell_", "file_id_db"))
     )
 
-    versioned_cache_directory <- file.path(cache_directory, COUNTS_VERSION)
+    versioned_cache_directory <- cache_directory
     versioned_cache_directory |> dir.create(
         showWarnings = FALSE,
         recursive = TRUE
diff --git a/R/dev.R b/R/dev.R
index a30d4cd..18a9b2c 100644
--- a/R/dev.R
+++ b/R/dev.R
@@ -135,7 +135,7 @@ update_unharmonised <- function(unharmonised_parquet_dir, ...){
 #' @keywords internal
 #' @return A character vector of the newly-created anndata files
 #' @examples
-#' \donttest{
+#' \dontrun{
 #' hdf5_to_anndata(
 #'     "/vast/projects/cellxgene_curated/splitted_DB2_data_0.2.1",
 #'     "/vast/projects/cellxgene_curated/splitted_DB2_anndata_0.2.1"
@@ -194,7 +194,7 @@ hdf5_to_anndata <- function(input_directory, output_directory){
 # @return A character vector of the newly-created anndata files
 # @noRd
 # @examples
-# \donttest{
+# \dontrun{
 # h5seurat_to_anndata(
 #     "/vast/projects/cellxgene_curated/splitted_DB2_data_0.2.1",
 #     "/vast/projects/cellxgene_curated/splitted_DB2_anndata_0.2.1"
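A note on the R/counts.R hunk above: dropping `COUNTS_VERSION` from the path works because the version segment moves into `get_default_cache_dir()` itself (see the R/utils.R hunk further down). A rough sketch of where the cache now resolves, assuming `COUNTS_VERSION` is "0.2.1" as the updated test at the bottom of this patch expects:

```r
# Illustration only: the versioned default cache location after this patch.
# tools::R_user_dir() picks the platform-specific cache root (~/.cache/R on Linux).
file.path(tools::R_user_dir("CuratedAtlasQueryR", "cache"), "0.2.1")
#> e.g. "/home/<user>/.cache/R/CuratedAtlasQueryR/0.2.1" on a typical Linux setup
```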
diff --git a/R/metadata.R b/R/metadata.R
index b4adc52..9e6ca03 100644
--- a/R/metadata.R
+++ b/R/metadata.R
@@ -9,16 +9,20 @@ cache <- rlang::env(
     metadata_table = rlang::env()
 )
 
-#' URL pointing to the full metadata file
+#' Returns the URLs for all metadata files
 #' @export
-#' @return A character scalar consisting of the URL
+#' @return A character vector of URLs, one for each requested parquet file
 #' @examples
-#' get_metadata(remote_url = DATABASE_URL)
-DATABASE_URL <- single_line_str(
-    "https://object-store.rc.nectar.org.au/v1/
-    AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/metadata.0.2.3.parquet"
-)
+#' get_metadata(remote_url = get_database_url("metadata.0.2.3.parquet"))
+
+get_database_url <- function(databases = c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet")) {
+    glue::glue(
+        "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/{databases}"
+    )
+}
+
+
 #' URL pointing to the sample metadata file, which is smaller and for test,
 #' demonstration, and vignette purposes only
 #' @export
@@ -38,8 +42,8 @@ SAMPLE_DATABASE_URL <- single_line_str(
 #' into [get_single_cell_experiment()] to obtain a
 #' [`SingleCellExperiment::SingleCellExperiment-class`]
 #'
-#' @param remote_url Optional character vector of length 1. An HTTP URL pointing
-#' to the location of the parquet database.
+#' @param remote_url Optional character vector of any length. One or more HTTP URLs,
+#' each pointing to the location of a parquet database file.
 #' @param cache_directory Optional character vector of length 1. A file path on
 #' your local system to a directory (not a file) that will be used to store
 #' `metadata.parquet`
@@ -68,6 +72,7 @@ SAMPLE_DATABASE_URL <- single_line_str(
 #' @importFrom httr progress
 #' @importFrom cli cli_alert_info hash_sha256
 #' @importFrom glue glue
+#' @importFrom purrr walk
 #'
 #' @details
 #'
@@ -139,34 +144,40 @@ SAMPLE_DATABASE_URL <- single_line_str(
 #' The solution is to choose a different cache, for example
 #' ```R
 #' get_metadata(cache_directory = path.expand('~'))
 #' ```
 get_metadata <- function(
-    remote_url = DATABASE_URL,
+    remote_url = get_database_url(),
     cache_directory = get_default_cache_dir(),
     use_cache = TRUE
 ) {
-    hash <- c(remote_url, cache_directory) |> paste0(collapse="") |>
-        hash_sha256()
-    cached_connection <- cache$metadata_table[[hash]]
-    if (!is.null(cached_connection) && isTRUE(use_cache)) {
-        cached_connection
-    }
-    else {
-        db_path <- file.path(cache_directory, "metadata.0.2.3.parquet")
-
-        if (!file.exists(db_path)){
-            report_file_sizes(remote_url)
-            sync_remote_file(
-                remote_url,
-                db_path,
-                progress(type = "down", con = stderr())
-            )
+    # Download remote files if needed
+    walk(remote_url, function(url) {
+        path <- file.path(cache_directory, url |> basename())
+        if (!file.exists(path)) {
+            report_file_sizes(url)
+            sync_remote_file(url,
+                path,
+                progress(type = "down", con = stderr()))
         }
+    })
+
+    all_metadata <- list.files(cache_directory, pattern = "\\.parquet$")
+    hash <- all_metadata |>
+        paste0(collapse = "") |>
+        hash_sha256()
 
-        table <- duckdb() |>
-            dbConnect(drv = _, read_only = TRUE) |>
-            read_parquet(db_path)
-        cache$metadata_table[[hash]] <- table
-        table
+    cached_connection <- cache$metadata_table[[hash]]
+    if (!is.null(cached_connection) && isTRUE(use_cache)) {
+        # Reuse the cached table if no parquet files have changed
+        cached_connection
+    }
+    else {
+        # Load all cached parquet files as a single lazy table
+        table <- duckdb() |>
+            dbConnect(drv = _, read_only = TRUE) |>
+            read_parquet(file.path(cache_directory, all_metadata))
+        cache$metadata_table[[hash]] <- table
+        table
     }
 }
+
diff --git a/R/unharmonised.R b/R/unharmonised.R
index 7fa2730..48f4870 100644
--- a/R/unharmonised.R
+++ b/R/unharmonised.R
@@ -37,7 +37,7 @@ UNHARMONISED_URL <- single_line_str(
 #' @return A named list, where each name is a dataset file ID, and each value is
 #'   a "lazy data frame", ie a `tbl`.
 #' @examples
-#' \donttest{
+#' \dontrun{
 #' dataset <- "838ea006-2369-4e2c-b426-b2a744a2b02b"
 #' harmonised_meta <- get_metadata() |>
 #'     dplyr::filter(file_id == dataset) |> dplyr::collect()
@@ -54,7 +54,6 @@ get_unharmonised_dataset <- function(
 ){
     unharmonised_root <- file.path(
         cache_directory,
-        COUNTS_VERSION,
         "unharmonised"
     )
     file_name <- glue::glue("{dataset_id}.parquet")
diff --git a/R/utils.R b/R/utils.R
index b551aa8..7140a28 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -41,7 +41,7 @@ single_line_str <- function(text){
     str_remove_all(text, r"(\n\s*)")
 }
 
-#' Returns the default cache directory
+#' Returns the default cache directory with a version number
 #' @return A length one character vector.
 #' @importFrom tools R_user_dir
 #' @importFrom utils packageName
@@ -51,6 +51,7 @@ get_default_cache_dir <- function() {
         R_user_dir(
             "cache"
         ) |>
+        file.path(COUNTS_VERSION) |>
         normalizePath() |>
         suppressWarnings()
 }
@@ -89,10 +90,11 @@ sync_remote_file <- function(full_url, output_file, ...) {
 #' @importFrom glue glue
 #' @importFrom dplyr tbl
 #' @importFrom dbplyr sql
+#' @importFrom glue glue_sql
 #' @return An SQL data frame
 #' @keywords internal
 read_parquet <- function(conn, path){
-    from_clause <- glue("FROM read_parquet('{path}')") |> sql()
+    from_clause <- glue_sql("FROM read_parquet([{`path`*}], union_by_name=true)", .con = conn) |> sql()
     tbl(conn, from_clause)
 }
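For context on the `read_parquet()` change just above: DuckDB can scan a list of parquet files in a single `FROM` clause and union them by column name, which is what lets `metadata.0.2.3.parquet` and `fibrosis.0.2.3.parquet` appear as one table even if their columns differ slightly. A minimal standalone sketch of that SQL, using hypothetical local file names rather than the exact clause `glue_sql()` generates:

```r
library(DBI)
library(duckdb)

con <- dbConnect(duckdb())

# Hypothetical cached files standing in for the real downloads
files <- c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet")

# Roughly the kind of clause the updated read_parquet() builds
query <- sprintf(
    "SELECT * FROM read_parquet([%s], union_by_name=true)",
    paste(sprintf("'%s'", files), collapse = ", ")
)
# dbGetQuery(con, query)  # would return the combined rows if the files existed

dbDisconnect(con, shutdown = TRUE)
```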
diff --git a/man/DATABASE_URL.Rd b/man/DATABASE_URL.Rd
index fa9818f..7dbb8de 100644
--- a/man/DATABASE_URL.Rd
+++ b/man/DATABASE_URL.Rd
@@ -1,14 +1,14 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/metadata.R
 \docType{data}
-\name{DATABASE_URL}
-\alias{DATABASE_URL}
+\name{get_database_url}
+\alias{get_database_url}
 \title{URL pointing to the full metadata file}
 \format{
 An object of class \code{character} of length 1.
 }
 \usage{
-DATABASE_URL
+get_database_url
 }
 \value{
 A character scalar consisting of the URL
@@ -17,6 +17,6 @@ A character scalar consisting of the URL
 URL pointing to the full metadata file
 }
 \examples{
-get_metadata(remote_url = DATABASE_URL)
+get_metadata(remote_url = get_database_url())
 }
 \keyword{datasets}
diff --git a/man/get_metadata.Rd b/man/get_metadata.Rd
index dee6052..88f69d2 100644
--- a/man/get_metadata.Rd
+++ b/man/get_metadata.Rd
@@ -5,7 +5,7 @@
 \title{Gets the Curated Atlas metadata as a data frame.}
 \usage{
 get_metadata(
-  remote_url = DATABASE_URL,
+  remote_url = get_database_url(),
   cache_directory = get_default_cache_dir(),
   use_cache = TRUE
 )
diff --git a/man/get_unharmonised_dataset.Rd b/man/get_unharmonised_dataset.Rd
index 261793c..3402325 100644
--- a/man/get_unharmonised_dataset.Rd
+++ b/man/get_unharmonised_dataset.Rd
@@ -40,7 +40,7 @@ make sense for these to live in the main metadata table. This function is a
 utility that allows easy fetching of this data if necessary.
 }
 \examples{
-\donttest{
+\dontrun{
 dataset <- "838ea006-2369-4e2c-b426-b2a744a2b02b"
 harmonised_meta <- get_metadata() |>
     dplyr::filter(file_id == dataset) |> dplyr::collect()
diff --git a/man/hdf5_to_anndata.Rd b/man/hdf5_to_anndata.Rd
index a1ef720..586fed5 100644
--- a/man/hdf5_to_anndata.Rd
+++ b/man/hdf5_to_anndata.Rd
@@ -20,7 +20,7 @@ A character vector of the newly-created anndata files
 Converts a series of HDF5Array-serialized SingleCellExperiments to AnnData
 }
 \examples{
-\donttest{
+\dontrun{
 hdf5_to_anndata(
     "/vast/projects/cellxgene_curated/splitted_DB2_data_0.2.1",
     "/vast/projects/cellxgene_curated/splitted_DB2_anndata_0.2.1"
diff --git a/tests/testthat/test-query.R b/tests/testthat/test-query.R
index 6e1610a..313d121 100755
--- a/tests/testthat/test-query.R
+++ b/tests/testthat/test-query.R
@@ -19,7 +19,7 @@ test_that("get_default_cache_dir() returns the correct directory on Linux", {
     grepl("linux", version$platform, fixed = TRUE) |>
         skip_if_not()
 
-    "~/.cache/R/CuratedAtlasQueryR" |>
+    "~/.cache/R/CuratedAtlasQueryR/0.2.1" |>
         normalizePath() |>
         expect_equal(
             get_default_cache_dir(),
@@ -131,7 +131,7 @@ test_that("get_SingleCellExperiment() assigns the right cell ID to each cell", {
     # Load the SCE from cache directly
     assay_1 = CuratedAtlasQueryR:::get_default_cache_dir() |>
-        file.path(CuratedAtlasQueryR:::COUNTS_VERSION, "original", id) |>
+        file.path("original", id) |>
         HDF5Array::loadHDF5SummarizedExperiment() |>
         assay("X") |>
         as.matrix()
@@ -190,3 +190,18 @@ test_that("get_metadata() is cached", {
     identical(table, table_2) |> expect_true()
 })
+
+test_that("get_database_url() returns character URLs", {
+    get_database_url() |>
+        expect_type("character")
+})
+
+test_that("get_metadata() finds cells with cell_type_harmonised 'b', which only come from the fibrosis database", {
+    n_cell <- get_metadata() |> filter(cell_type_harmonised == 'b') |> as_tibble() |> nrow()
+    expect_true(n_cell > 0)
+})
diff --git a/vignettes/Introduction.Rmd b/vignettes/Introduction.Rmd
index dbba7f5..916e3d6 100644
--- a/vignettes/Introduction.Rmd
+++ b/vignettes/Introduction.Rmd
@@ -51,7 +51,7 @@ find_figure <- function(names){
 }
 
 METADATA_URL = if (params$demo_metadata)
     CuratedAtlasQueryR::SAMPLE_DATABASE_URL else
-    CuratedAtlasQueryR::DATABASE_URL
+    CuratedAtlasQueryR::get_database_url()
 ```
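Taken together, the interface after this patch would be used roughly as below (a sketch, assuming both default parquet files are reachable in the Nectar object store and that network access is available):

```r
library(CuratedAtlasQueryR)

# Both default databases (metadata + fibrosis) are downloaded into the
# versioned cache and exposed as one lazy duckdb-backed table
metadata <- get_metadata(remote_url = get_database_url())

# A single database can still be requested by name
fibrosis_meta <- get_metadata(
    remote_url = get_database_url("fibrosis.0.2.3.parquet")
)
```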