
update metadata function to allow interacting with multiple databases #130

Closed
wants to merge 13 commits
4 changes: 3 additions & 1 deletion NAMESPACE
@@ -1,7 +1,7 @@
# Generated by roxygen2: do not edit by hand

S3method(as.sparse,DelayedMatrix)
-export(DATABASE_URL)
+export(get_database_url)
export(SAMPLE_DATABASE_URL)
export(get_SingleCellExperiment)
export(get_metadata)
@@ -43,6 +43,7 @@ importFrom(dplyr,tbl)
importFrom(dplyr,transmute)
importFrom(duckdb,duckdb)
importFrom(glue,glue)
+importFrom(glue,glue_sql)
importFrom(httr,GET)
importFrom(httr,HEAD)
importFrom(httr,modify_url)
@@ -61,6 +62,7 @@ importFrom(purrr,pmap_chr)
importFrom(purrr,reduce)
importFrom(purrr,set_names)
importFrom(purrr,walk)
+importFrom(purrr,walk2)
importFrom(rlang,.data)
importFrom(stats,setNames)
importFrom(stringr,str_remove_all)
2 changes: 1 addition & 1 deletion R/counts.R
@@ -112,7 +112,7 @@ get_single_cell_experiment <- function(
has_name(raw_data, c("cell_", "file_id_db"))
)

-versioned_cache_directory <- file.path(cache_directory, COUNTS_VERSION)
+versioned_cache_directory <- cache_directory
versioned_cache_directory |> dir.create(
showWarnings = FALSE,
recursive = TRUE
4 changes: 2 additions & 2 deletions R/dev.R
@@ -135,7 +135,7 @@ update_unharmonised <- function(unharmonised_parquet_dir, ...){
#' @keywords internal
#' @return A character vector of the newly-created anndata files
#' @examples
-#' \donttest{
+#' \dontrun{
#' hdf5_to_anndata(
#' "/vast/projects/cellxgene_curated/splitted_DB2_data_0.2.1",
#' "/vast/projects/cellxgene_curated/splitted_DB2_anndata_0.2.1"
@@ -194,7 +194,7 @@ hdf5_to_anndata <- function(input_directory, output_directory){
# @return A character vector of the newly-created anndata files
# @noRd
# @examples
-# \donttest{
+# \dontrun{
# h5seurat_to_anndata(
# "/vast/projects/cellxgene_curated/splitted_DB2_data_0.2.1",
# "/vast/projects/cellxgene_curated/splitted_DB2_anndata_0.2.1"
46 changes: 27 additions & 19 deletions R/metadata.R
@@ -9,16 +9,21 @@ cache <- rlang::env(
metadata_table = rlang::env()
)

-#' URL pointing to the full metadata file
+#' Returns the URLs for all metadata files
#' @export
-#' @return A character scalar consisting of the URL
+#' @return A named character vector whose names are parquet file names, and whose values are URLs
+#' @param databases A character vector of parquet file names to build URLs for
#' @examples
-#' get_metadata(remote_url = DATABASE_URL)
-DATABASE_URL <- single_line_str(
-    "https://object-store.rc.nectar.org.au/v1/
-    AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/metadata.0.2.3.parquet"
-)
+#' get_metadata(remote_url = get_database_url("metadata.0.2.3.parquet"))
+get_database_url <- function(databases = c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet")) {
+    glue::glue(
+        "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/{databases}"
+    ) |>
+        setNames(databases)
+}
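Reviewer note: since the return shape changes from a character scalar to a named vector, a quick sketch of the new helper's behaviour may help. This is illustrative only, derived from the hunk above; the printed output is approximate.

```r
library(CuratedAtlasQueryR)

# The names are the parquet file names, the values the full URLs
urls <- get_database_url()
names(urls)
#> [1] "metadata.0.2.3.parquet" "fibrosis.0.2.3.parquet"

# A single database can be requested by file name
get_database_url("fibrosis.0.2.3.parquet")
#> fibrosis.0.2.3.parquet
#> ".../metadata/fibrosis.0.2.3.parquet"
```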


#' URL pointing to the sample metadata file, which is smaller and for test,
#' demonstration, and vignette purposes only
#' @export
@@ -38,8 +43,8 @@ SAMPLE_DATABASE_URL <- single_line_str(
#' into [get_single_cell_experiment()] to obtain a
#' [`SingleCellExperiment::SingleCellExperiment-class`]
#'
-#' @param remote_url Optional character vector of length 1. An HTTP URL pointing
-#' to the location of the parquet database.
+#' @param remote_url Optional character vector of any length. One or more HTTP
+#' URLs, each pointing to the location of a parquet database.
#' @param cache_directory Optional character vector of length 1. A file path on
#' your local system to a directory (not a file) that will be used to store
#' `metadata.parquet`
@@ -68,6 +73,7 @@ SAMPLE_DATABASE_URL <- single_line_str(
#' @importFrom httr progress
#' @importFrom cli cli_alert_info hash_sha256
#' @importFrom glue glue
+#' @importFrom purrr walk2
#'
#' @details
#'
@@ -141,8 +147,10 @@
#'
#' get_metadata(cache_directory = path.expand('~'))
#'
get_metadata <- function(
-    remote_url = DATABASE_URL,
+    remote_url = get_database_url(),
cache_directory = get_default_cache_dir(),
use_cache = TRUE
) {
@@ -153,16 +161,15 @@
cached_connection
}
else {
db_path <- file.path(cache_directory, "metadata.0.2.3.parquet")

if (!file.exists(db_path)){
report_file_sizes(remote_url)
sync_remote_file(
remote_url,
db_path,
progress(type = "down", con = stderr())
)
db_path <- file.path(cache_directory, remote_url |> basename())
walk2(remote_url, db_path, function(url, path) {
if (!file.exists(path)) {
report_file_sizes(url)
sync_remote_file(url,
path,
progress(type = "down", con = stderr()))
}
})

table <- duckdb() |>
dbConnect(drv = _, read_only = TRUE) |>
@@ -171,3 +178,4 @@
table
}
}
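Reviewer note: a usage sketch of the reworked download logic, based only on this diff. Each URL is cached under its basename and downloaded only when the file is missing; all parquet files are then served through one lazy DuckDB table.

```r
library(CuratedAtlasQueryR)

# Default: fetches (if not cached) and unions both parquet databases
metadata <- get_metadata()

# Restrict to a single database
metadata_main <- get_metadata(
    remote_url = get_database_url("metadata.0.2.3.parquet")
)

# Several URLs at once; each file is cached by basename in cache_directory
metadata_all <- get_metadata(
    remote_url = get_database_url(
        c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet")
    ),
    cache_directory = tempdir()
)
```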

3 changes: 1 addition & 2 deletions R/unharmonised.R
@@ -37,7 +37,7 @@ UNHARMONISED_URL <- single_line_str(
#' @return A named list, where each name is a dataset file ID, and each value is
#' a "lazy data frame", ie a `tbl`.
#' @examples
-#' \donttest{
+#' \dontrun{
#' dataset <- "838ea006-2369-4e2c-b426-b2a744a2b02b"
#' harmonised_meta <- get_metadata() |>
#' dplyr::filter(file_id == dataset) |> dplyr::collect()
@@ -54,7 +54,6 @@ get_unharmonised_dataset <- function(
){
unharmonised_root <- file.path(
cache_directory,
-    COUNTS_VERSION,
"unharmonised"
)
file_name <- glue::glue("{dataset_id}.parquet")
6 changes: 4 additions & 2 deletions R/utils.R
@@ -41,7 +41,7 @@ single_line_str <- function(text){
str_remove_all(text, r"(\n\s*)")
}

-#' Returns the default cache directory
+#' Returns the default cache directory with a version number
#' @return A length one character vector.
#' @importFrom tools R_user_dir
#' @importFrom utils packageName
@@ -51,6 +51,7 @@ get_default_cache_dir <- function() {
R_user_dir(
"cache"
) |>
+    file.path(COUNTS_VERSION) |>
normalizePath() |>
suppressWarnings()
}
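Reviewer note: with this hunk the version segment moves into the default cache directory itself, which is why the COUNTS_VERSION components disappear from counts.R, unharmonised.R, and the test below. A standalone sketch of the resulting layout; "0.2.1" is a placeholder for COUNTS_VERSION, whose real value is defined elsewhere in the package.

```r
# Placeholder: the real COUNTS_VERSION is defined elsewhere in the package
COUNTS_VERSION <- "0.2.1"

default_cache_dir <- tools::R_user_dir("CuratedAtlasQueryR", which = "cache") |>
    file.path(COUNTS_VERSION)

default_cache_dir
#> e.g. "~/.cache/R/CuratedAtlasQueryR/0.2.1" on Linux
```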
@@ -89,10 +90,11 @@ sync_remote_file <- function(full_url, output_file, ...) {
#' @importFrom glue glue
#' @importFrom dplyr tbl
#' @importFrom dbplyr sql
+#' @importFrom glue glue_sql
#' @return An SQL data frame
#' @keywords internal
read_parquet <- function(conn, path){
-    from_clause <- glue("FROM read_parquet('{path}')") |> sql()
+    from_clause <- glue_sql("FROM read_parquet([{`path`*}], union_by_name=true)", .con=conn) |> sql()
tbl(conn, from_clause)
}
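Reviewer note: the new from_clause leans on DuckDB's ability to read a list of parquet files, with union_by_name=true aligning columns by name across files whose schemas differ. Below is a self-contained sketch of that pattern; it interpolates the paths as SQL string literals ({paths*}), a simpler glue_sql variant than the identifier interpolation used in the hunk, and the file names are placeholders.

```r
library(DBI)
library(duckdb)
library(glue)

con <- dbConnect(duckdb())

# Two tiny parquet files with overlapping but non-identical columns
dbExecute(con, "COPY (SELECT 1 AS id, 'b' AS cell_type) TO 'a.parquet' (FORMAT PARQUET)")
dbExecute(con, "COPY (SELECT 2 AS id, 'lung' AS tissue) TO 'b.parquet' (FORMAT PARQUET)")

paths <- c("a.parquet", "b.parquet")

# union_by_name=true unifies the two schemas, filling gaps with NULL
query <- glue_sql(
    "SELECT * FROM read_parquet([{paths*}], union_by_name=true)",
    .con = con
)
dbGetQuery(con, query)
#>   id cell_type tissue
#> 1  1         b   <NA>
#> 2  2      <NA>   lung

dbDisconnect(con, shutdown = TRUE)
```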

8 changes: 4 additions & 4 deletions man/DATABASE_URL.Rd


2 changes: 1 addition & 1 deletion man/get_metadata.Rd


2 changes: 1 addition & 1 deletion man/get_unharmonised_dataset.Rd


2 changes: 1 addition & 1 deletion man/hdf5_to_anndata.Rd


17 changes: 16 additions & 1 deletion tests/testthat/test-query.R
@@ -131,7 +131,7 @@ test_that("get_SingleCellExperiment() assigns the right cell ID to each cell", {

# Load the SCE from cache directly
assay_1 = CuratedAtlasQueryR:::get_default_cache_dir() |>
-    file.path(CuratedAtlasQueryR:::COUNTS_VERSION, "original", id) |>
+    file.path("original", id) |>
HDF5Array::loadHDF5SummarizedExperiment() |>
assay("X") |>
as.matrix()
@@ -190,3 +190,18 @@ test_that("get_metadata() is cached", {

identical(table, table_2) |> expect_true()
})

test_that("database_url() expect character ", {
get_database_url() |>
expect_s3_class("character")
})


test_that("get_metadata() expect a unique cell_type `b` is present, which comes from fibrosis database", {
n_cell <- get_metadata() |> filter(cell_type_harmonised == 'b') |> as_tibble() |> nrow()
expect_true(n_cell > 0)
})




2 changes: 1 addition & 1 deletion vignettes/Introduction.Rmd
@@ -51,7 +51,7 @@ find_figure <- function(names){
}
METADATA_URL = if (params$demo_metadata)
CuratedAtlasQueryR::SAMPLE_DATABASE_URL else
-    CuratedAtlasQueryR::DATABASE_URL
+    CuratedAtlasQueryR::get_database_url()
```

<!-- badges: start -->