Refactor get_metadata() to use arbitrary parquet files
Conflicts:
	R/metadata.R
multimeric committed Feb 22, 2024
2 parents fd2009a + 78b8c0f commit 7efdf77
Showing 12 changed files with 79 additions and 50 deletions.
4 changes: 3 additions & 1 deletion NAMESPACE
@@ -1,7 +1,7 @@
 # Generated by roxygen2: do not edit by hand

 S3method(as.sparse,DelayedMatrix)
-export(DATABASE_URL)
+export(get_database_url)
 export(SAMPLE_DATABASE_URL)
 export(get_SingleCellExperiment)
 export(get_metadata)
@@ -43,6 +43,7 @@ importFrom(dplyr,tbl)
 importFrom(dplyr,transmute)
 importFrom(duckdb,duckdb)
 importFrom(glue,glue)
+importFrom(glue,glue_sql)
 importFrom(httr,GET)
 importFrom(httr,HEAD)
 importFrom(httr,modify_url)
@@ -61,6 +62,7 @@ importFrom(purrr,pmap_chr)
 importFrom(purrr,reduce)
 importFrom(purrr,set_names)
 importFrom(purrr,walk)
+importFrom(purrr,walk2)
 importFrom(rlang,.data)
 importFrom(stats,setNames)
 importFrom(stringr,str_remove_all)
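NAMESPACE is regenerated by roxygen2 (e.g. via `devtools::document()`), so each changed directive corresponds one-to-one to a roxygen tag in the R sources; a sketch of the mapping for the entries above:

```r
# roxygen tag in an R source file     NAMESPACE directive it generates
#' @export                          # export(get_database_url)  (above the function)
#' @importFrom glue glue_sql        # importFrom(glue,glue_sql)
#' @importFrom purrr walk2          # importFrom(purrr,walk2)
```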
2 changes: 1 addition & 1 deletion R/counts.R
@@ -112,7 +112,7 @@ get_single_cell_experiment <- function(
     has_name(raw_data, c("cell_", "file_id_db"))
 )

-    versioned_cache_directory <- file.path(cache_directory, COUNTS_VERSION)
+    versioned_cache_directory <- cache_directory
     versioned_cache_directory |> dir.create(
         showWarnings = FALSE,
         recursive = TRUE
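Paired with the `get_default_cache_dir()` change in R/utils.R further down, this moves the version segment from the call site into the default cache path itself, so the resolved location is unchanged; a sketch, assuming `COUNTS_VERSION` is "0.2.1" as the updated test expects:

```r
# Before: the version was appended inside get_single_cell_experiment()
#   file.path(cache_directory, COUNTS_VERSION)
#   e.g. "~/.cache/R/CuratedAtlasQueryR" + "0.2.1"
# After: get_default_cache_dir() already ends in the version,
#   so the caller's cache_directory is used verbatim
#   e.g. "~/.cache/R/CuratedAtlasQueryR/0.2.1"
```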
4 changes: 2 additions & 2 deletions R/dev.R
@@ -135,7 +135,7 @@ update_unharmonised <- function(unharmonised_parquet_dir, ...){
 #' @keywords internal
 #' @return A character vector of the newly-created anndata files
 #' @examples
-#' \donttest{
+#' \dontrun{
 #' hdf5_to_anndata(
 #'     "/vast/projects/cellxgene_curated/splitted_DB2_data_0.2.1",
 #'     "/vast/projects/cellxgene_curated/splitted_DB2_anndata_0.2.1"
@@ -194,7 +194,7 @@ hdf5_to_anndata <- function(input_directory, output_directory){
 # @return A character vector of the newly-created anndata files
 # @noRd
 # @examples
-# \donttest{
+# \dontrun{
 # h5seurat_to_anndata(
 #     "/vast/projects/cellxgene_curated/splitted_DB2_data_0.2.1",
 #     "/vast/projects/cellxgene_curated/splitted_DB2_anndata_0.2.1"
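For context on the `\donttest` to `\dontrun` switch: both tags keep an example out of interactive `example()` runs, but `\donttest` examples can still be executed by `R CMD check` (for instance under `--as-cran`), while `\dontrun` examples are never executed, which suits examples that depend on HPC-only paths like `/vast/projects`. A minimal roxygen sketch:

```r
#' @examples
#' \dontrun{
#' # Never run by R CMD check; only displayed in the rendered help page
#' hdf5_to_anndata(input_directory, output_directory)
#' }
```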
75 changes: 43 additions & 32 deletions R/metadata.R
@@ -9,16 +9,20 @@ cache <- rlang::env(
     metadata_table = rlang::env()
 )

-#' URL pointing to the full metadata file
+#' Returns the URLs for all metadata files
 #' @export
-#' @return A character scalar consisting of the URL
+#' @return A named character vector whose names are parquet file names, and whose values are URLs
 #' @examples
-#' get_metadata(remote_url = DATABASE_URL)
-DATABASE_URL <- single_line_str(
-    "https://object-store.rc.nectar.org.au/v1/
-    AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/metadata.0.2.3.parquet"
-)
+#' get_metadata(remote_url = get_database_url("metadata.0.2.3.parquet"))
+get_database_url <- function(databases = c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet")) {
+    glue::glue(
+        "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/{databases}"
+    )
+}


 #' URL pointing to the sample metadata file, which is smaller and for test,
 #' demonstration, and vignette purposes only
 #' @export
@@ -38,8 +42,8 @@ SAMPLE_DATABASE_URL <- single_line_str(
 #' into [get_single_cell_experiment()] to obtain a
 #' [`SingleCellExperiment::SingleCellExperiment-class`]
 #'
-#' @param remote_url Optional character vector of length 1. An HTTP URL pointing
-#'   to the location of the parquet database.
+#' @param remote_url Optional character vector of any length. One or more HTTP
+#'   URLs, each pointing to the location of a parquet database file.
 #' @param cache_directory Optional character vector of length 1. A file path on
 #'   your local system to a directory (not a file) that will be used to store
 #'   `metadata.parquet`
@@ -68,6 +72,7 @@ SAMPLE_DATABASE_URL <- single_line_str(
 #' @importFrom httr progress
 #' @importFrom cli cli_alert_info hash_sha256
 #' @importFrom glue glue
+#' @importFrom purrr walk
 #'
 #' @details
 #'
@@ -139,34 +144,40 @@ SAMPLE_DATABASE_URL <- single_line_str(
 #' The solution is to choose a different cache, for example
 #' ```R
 #' get_metadata(cache_directory = path.expand('~'))
 #' ```
 get_metadata <- function(
-    remote_url = DATABASE_URL,
+    remote_url = get_database_url(),
     cache_directory = get_default_cache_dir(),
     use_cache = TRUE
 ) {
-    hash <- c(remote_url, cache_directory) |>
-        paste0(collapse = "") |>
-        hash_sha256()
-    cached_connection <- cache$metadata_table[[hash]]
-    if (!is.null(cached_connection) && isTRUE(use_cache)) {
-        cached_connection
-    }
-    else {
-        db_path <- file.path(cache_directory, "metadata.0.2.3.parquet")
-
-        if (!file.exists(db_path)) {
-            report_file_sizes(remote_url)
-            sync_remote_file(
-                remote_url,
-                db_path,
-                progress(type = "down", con = stderr())
-            )
-        }
+    # Download remote files if needed
+    walk(remote_url, function(url) {
+        path <- file.path(cache_directory, url |> basename())
+        if (!file.exists(path)) {
+            report_file_sizes(url)
+            sync_remote_file(
+                url,
+                path,
+                progress(type = "down", con = stderr())
+            )
+        }
+    })
+
+    all_metadata <- list.files(cache_directory, pattern = "*.parquet")
+    hash <- all_metadata |>
+        paste0(collapse = "") |>
+        hash_sha256()

-        table <- duckdb() |>
-            dbConnect(drv = _, read_only = TRUE) |>
-            read_parquet(db_path)
-        cache$metadata_table[[hash]] <- table
-        table
+    cached_connection <- cache$metadata_table[[hash]]
+    if (!is.null(cached_connection) && isTRUE(use_cache)) {
+        # Reuse the hashed copy of the table if no parquet files have changed
+        cached_connection
+    }
+    else {
+        # Load all parquet files as one lazy data frame
+        table <- duckdb() |>
+            dbConnect(drv = _, read_only = TRUE) |>
+            read_parquet(file.path(cache_directory, all_metadata))
+        cache$metadata_table[[hash]] <- table
+        table
     }
 }

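Taken together, a usage sketch of the refactored interface (based only on the exports in this diff; the filter value 'b' comes from the new test further down):

```r
library(CuratedAtlasQueryR)
library(dplyr)

# URLs of the bundled databases (metadata + fibrosis by default)
get_database_url()

# Query a single parquet file...
meta <- get_metadata(remote_url = get_database_url("metadata.0.2.3.parquet"))

# ...or download the defaults and query the union of every parquet
# file in the cache directory as one lazy tbl
all_meta <- get_metadata()
all_meta |>
    filter(cell_type_harmonised == "b") |>
    head()
```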
3 changes: 1 addition & 2 deletions R/unharmonised.R
@@ -37,7 +37,7 @@ UNHARMONISED_URL <- single_line_str(
 #' @return A named list, where each name is a dataset file ID, and each value is
 #'   a "lazy data frame", ie a `tbl`.
 #' @examples
-#' \donttest{
+#' \dontrun{
 #' dataset <- "838ea006-2369-4e2c-b426-b2a744a2b02b"
 #' harmonised_meta <- get_metadata() |>
 #'     dplyr::filter(file_id == dataset) |> dplyr::collect()
@@ -54,7 +54,6 @@ get_unharmonised_dataset <- function(
 ){
     unharmonised_root <- file.path(
         cache_directory,
-        COUNTS_VERSION,
         "unharmonised"
     )
     file_name <- glue::glue("{dataset_id}.parquet")
6 changes: 4 additions & 2 deletions R/utils.R
@@ -41,7 +41,7 @@ single_line_str <- function(text){
     str_remove_all(text, r"(\n\s*)")
 }

-#' Returns the default cache directory
+#' Returns the default cache directory with a version number
 #' @return A length one character vector.
 #' @importFrom tools R_user_dir
 #' @importFrom utils packageName
@@ -51,6 +51,7 @@ get_default_cache_dir <- function() {
     R_user_dir(
         "cache"
     ) |>
+        file.path(COUNTS_VERSION) |>
         normalizePath() |>
         suppressWarnings()
 }
@@ -89,10 +90,11 @@ sync_remote_file <- function(full_url, output_file, ...) {
 #' @importFrom glue glue
 #' @importFrom dplyr tbl
 #' @importFrom dbplyr sql
+#' @importFrom glue glue_sql
 #' @return An SQL data frame
 #' @keywords internal
 read_parquet <- function(conn, path){
-    from_clause <- glue("FROM read_parquet('{path}')") |> sql()
+    from_clause <- glue_sql("FROM read_parquet([{`path`*}], union_by_name=true)", .con = conn) |> sql()
     tbl(conn, from_clause)
 }

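The rewritten `read_parquet()` asks DuckDB to scan a list of files rather than a single one; roughly equivalent standalone code (a sketch with hypothetical local file paths, not the package's exact implementation):

```r
library(duckdb)
library(DBI)
library(dplyr)
library(dbplyr)

con <- dbConnect(duckdb(), read_only = TRUE)

# Hypothetical local copies of the two bundled databases
paths <- c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet")

# Quote each path as a SQL string literal; union_by_name=true aligns
# columns across files by name, filling missing columns with NULL
file_list <- paste0("'", paths, "'", collapse = ", ")
from_clause <- sql(paste0(
    "FROM read_parquet([", file_list, "], union_by_name=true)"
))

# One lazy tbl over the union of all parquet files
metadata <- tbl(con, from_clause)
```

The list form plus `union_by_name` is what lets a later database (such as the fibrosis file) add columns the original metadata file lacks without breaking existing queries.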
8 changes: 4 additions & 4 deletions man/DATABASE_URL.Rd
2 changes: 1 addition & 1 deletion man/get_metadata.Rd
2 changes: 1 addition & 1 deletion man/get_unharmonised_dataset.Rd
2 changes: 1 addition & 1 deletion man/hdf5_to_anndata.Rd

Some generated files are not rendered by default.
19 changes: 17 additions & 2 deletions tests/testthat/test-query.R
@@ -19,7 +19,7 @@ test_that("get_default_cache_dir() returns the correct directory on Linux", {
     grepl("linux", version$platform, fixed = TRUE) |>
         skip_if_not()

-    "~/.cache/R/CuratedAtlasQueryR" |>
+    "~/.cache/R/CuratedAtlasQueryR/0.2.1" |>
         normalizePath() |>
         expect_equal(
             get_default_cache_dir(),
@@ -131,7 +131,7 @@ test_that("get_SingleCellExperiment() assigns the right cell ID to each cell", {

# Load the SCE from cache directly
assay_1 = CuratedAtlasQueryR:::get_default_cache_dir() |>
file.path(CuratedAtlasQueryR:::COUNTS_VERSION, "original", id) |>
file.path("original", id) |>
HDF5Array::loadHDF5SummarizedExperiment() |>
assay("X") |>
as.matrix()
@@ -190,3 +190,18 @@ test_that("get_metadata() is cached", {

     identical(table, table_2) |> expect_true()
 })
+
+test_that("get_database_url() returns a character vector", {
+    get_database_url() |>
+        expect_s3_class("character")
+})
+
+test_that("get_metadata() includes cell_type 'b', which comes from the fibrosis database", {
+    n_cell <- get_metadata() |>
+        filter(cell_type_harmonised == 'b') |>
+        as_tibble() |>
+        nrow()
+    expect_true(n_cell > 0)
+})
2 changes: 1 addition & 1 deletion vignettes/Introduction.Rmd
@@ -51,7 +51,7 @@ find_figure <- function(names){
 }
 METADATA_URL = if (params$demo_metadata)
     CuratedAtlasQueryR::SAMPLE_DATABASE_URL else
-    CuratedAtlasQueryR::DATABASE_URL
+    CuratedAtlasQueryR::get_database_url("metadata.0.2.3.parquet")
 ```

 <!-- badges: start -->
