Skip to content

Commit

Permalink
up
Browse files Browse the repository at this point in the history
  • Loading branch information
TuomasBorman committed Nov 15, 2024
1 parent b53151f commit aa5c258
Show file tree
Hide file tree
Showing 7 changed files with 54 additions and 56 deletions.
26 changes: 12 additions & 14 deletions R/agglomerate.R
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
#' regarded as empty. (Default: \code{c(NA, "", " ", "\t")}). They will be
#' removed if \code{na.rm = TRUE} before agglomeration.
#'
#' @param empty.rows.rm \code{Logical scalar}. Defines whether rows including
#' @param empty.rm \code{Logical scalar}. Defines whether rows including
#' \code{empty.fields} in specified \code{rank} will be excluded.
#' (Default: \code{TRUE})
#'
Expand All @@ -69,7 +69,7 @@
#' whether to remove those columns of rowData that include only NAs after
#' agglomeration. (Default: \code{FALSE})
#'
#' \item \code{group.rm}: \code{Logical scalar}. Determines
#' \item \code{empty.rm}: \code{Logical scalar}. Determines
#' whether to remove rows that do not belong to any group, i.e., that
#' have \code{NA} value. (Default: \code{FALSE})
#'
Expand Down Expand Up @@ -160,9 +160,9 @@
#' tse <- agglomerateByRank(tse, rank = "Genus")
#' tse <- transformAssay(tse, method = "pa")
#'
#' # removing empty labels by setting empty.rows.rm = TRUE
#' # removing empty labels by setting empty.rm = TRUE
#' sum(is.na(rowData(GlobalPatterns)$Family))
#' x3 <- agglomerateByRank(GlobalPatterns, rank="Family", empty.rows.rm = TRUE)
#' x3 <- agglomerateByRank(GlobalPatterns, rank="Family", empty.rm = TRUE)
#' nrow(x3) # different from x2
#'
#' # Because all the rownames are from the same rank, rownames do not include
Expand Down Expand Up @@ -285,7 +285,7 @@ setMethod(
#' @importFrom SummarizedExperiment rowData rowData<-
#' @export
setMethod("agglomerateByRank", signature = c(x = "SummarizedExperiment"),
function(x, rank = taxonomyRanks(x)[1], empty.rows.rm = TRUE,
function(x, rank = taxonomyRanks(x)[1], empty.rm = TRUE,
empty.fields = c(NA, "", " ", "\t", "-", "_"), ...){
# Input check
if(nrow(x) == 0L){
Expand All @@ -296,8 +296,8 @@ setMethod("agglomerateByRank", signature = c(x = "SummarizedExperiment"),
stop("'rank' must be a non-empty single character value",
call. = FALSE)
}
if(!.is_a_bool(empty.rows.rm)){
stop("'empty.rows.rm' must be TRUE or FALSE.", call. = FALSE)
if(!.is_a_bool(empty.rm)){
stop("'empty.rm' must be TRUE or FALSE.", call. = FALSE)
}
if(ncol(rowData(x)) == 0L){
stop("taxonomyData needs to be populated.", call. = FALSE)
Expand All @@ -311,10 +311,10 @@ setMethod("agglomerateByRank", signature = c(x = "SummarizedExperiment"),
# Get the indices of detected rank columns from rowData
tax_cols <- .get_tax_cols_from_se(x)

# if empty.rows.rm is TRUE, remove those rows that have empty,
# if empty.rm is TRUE, remove those rows that have empty,
# white-space, NA values in rank information. I.e., they do not have
# taxonomy information in specified taxonomy level.
if( empty.rows.rm ){
if( empty.rm ){
x <- .remove_with_empty_taxonomic_info(
x, tax_cols[col_idx], empty.fields)
}
Expand All @@ -329,16 +329,14 @@ setMethod("agglomerateByRank", signature = c(x = "SummarizedExperiment"),
# Get groups of taxonomy entries, i.e., get the specified rank
# column from rowData
tax_factors <- .get_tax_groups(x, col = col_idx, ...)
# Convert to factors. Use group.rm so that NA values are not
# Convert to factors. Use empty.rm so that NA values are not
# preserved. i.e. they are not converted into character values.
# NA values are handled earlier in this function.
tax_factors <- .norm_f(nrow(x), tax_factors, group.rm = TRUE)
tax_factors <- .norm_f(nrow(x), tax_factors, empty.rm = TRUE)

# Agglomerate data by utilizing agglomerateByVariable
args <- c(list(
x, by = "rows", group = tax_factors, group.rm = TRUE),
list(...)[ !names(list(...)) %in% c("group.rm") ]
)
x, by = "rows", group = tax_factors, empty.rm = TRUE), list(...))
x <- do.call(agglomerateByVariable, args)

# Replace the values to the right of the rank with NA_character_.
Expand Down
8 changes: 4 additions & 4 deletions R/merge.R
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# This function can be used to unify the group id vector. It can be any
# kind of vector, but this converts it to factor.
.norm_f <- function(
i, f, dim.type = c("rows","columns"), group.rm = FALSE, ...){
if(!.is_a_bool(group.rm)){
stop("'group.rm' must be TRUE or FALSE.", call. = FALSE)
i, f, dim.type = c("rows","columns"), empty.rm = FALSE, ...){
if(!.is_a_bool(empty.rm)){
stop("'empty.rm' must be TRUE or FALSE.", call. = FALSE)
}
dim.type <- match.arg(dim.type)
if(!is.character(f) && !is.factor(f)){
Expand All @@ -16,7 +16,7 @@
call. = FALSE)
}
# This is done otherwise we lose NA values
if( !group.rm && any(is.na(f)) ){
if( !empty.rm && any(is.na(f)) ){
f <- as.character(f)
f[ is.na(f) ] <- "NA"
}
Expand Down
8 changes: 4 additions & 4 deletions man/agglomerate-methods.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

48 changes: 24 additions & 24 deletions tests/testthat/test-3agglomerate.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,27 +27,27 @@ test_that("agglomerate", {
expect_equal(assays(actual)$mat[2,1],c(b = 36))
expect_equal(assays(actual)$mat[3,1],c(c = 24))
#
expect_error(agglomerateByRank(xtse,"",empty.rows.rm=FALSE),
expect_error(agglomerateByRank(xtse,"",empty.rm=FALSE),
"'rank' must be a non-empty single character value")
expect_error(agglomerateByRank(xtse,"Family",empty.rows.rm=""),
"'empty.rows.rm' must be TRUE or FALSE")
expect_error(agglomerateByRank(xtse,"Family",empty.rm=""),
"'empty.rm' must be TRUE or FALSE")
expect_error(
agglomerateByRank(xtse,"Family",empty.rows.rm=FALSE,update.tree=""),
agglomerateByRank(xtse,"Family",empty.rm=FALSE,update.tree=""),
"'update.tree' must be TRUE or FALSE")
xtse2 <- xtse
rowData(xtse2) <- NULL
expect_error(agglomerateByRank(xtse2,"Family",empty.rows.rm=FALSE),
expect_error(agglomerateByRank(xtse2,"Family",empty.rm=FALSE),
"taxonomyData needs to be populated")
#
actual <- agglomerateByRank(xtse,"Family",empty.rows.rm=FALSE)
actual <- agglomerateByRank(xtse,"Family",empty.rm=FALSE)
expect_equivalent(rowData(actual),rowData(actual_family))
actual <- agglomerateByRank(xtse,"Phylum",empty.rows.rm=FALSE)
actual <- agglomerateByRank(xtse,"Phylum",empty.rm=FALSE)
expect_equivalent(rowData(actual),rowData(actual_phylum))
#
actual <- agglomerateByRank(xtse,"Family", ignore.taxonomy = FALSE, empty.rows.rm = TRUE)
actual <- agglomerateByRank(xtse,"Family", ignore.taxonomy = FALSE, empty.rm = TRUE)
expect_equal(dim(actual),c(6,10))
expect_equal(rowData(actual)$Family,c("c","d","e","f","g","h"))
actual <- agglomerateByRank(xtse,"Family", ignore.taxonomy = FALSE, empty.rows.rm = FALSE)
actual <- agglomerateByRank(xtse,"Family", ignore.taxonomy = FALSE, empty.rm = FALSE)
expect_equal(dim(actual),c(8,10))
expect_equal(rowData(actual)$Family,c("c","d","e","f","g","h",NA,NA))
actual <- agglomerateByRank(xtse,"Phylum")
Expand All @@ -63,22 +63,22 @@ test_that("agglomerate", {
data(enterotype, package="mia")
expect_equal(length(unique(rowData(enterotype)[,"Genus"])),
nrow(agglomerateByRank(enterotype,"Genus", ignore.taxonomy = FALSE,
empty.rows.rm = FALSE)))
empty.rm = FALSE)))

# agglomeration in all its forms
data(GlobalPatterns, package="mia")
se <- GlobalPatterns
actual <- agglomerateByRank(se, rank = "Family",
ignore.taxonomy = FALSE, empty.rows.rm = FALSE)
ignore.taxonomy = FALSE, empty.rm = FALSE)
expect_equal(dim(actual),c(603,26))
expect_equal(length(rowTree(actual)$tip.label),
length(rowTree(se)$tip.label))
actual <- agglomerateByRank(se, rank = "Family",
ignore.taxonomy = FALSE, empty.rows.rm = FALSE, update.tree = TRUE)
ignore.taxonomy = FALSE, empty.rm = FALSE, update.tree = TRUE)
expect_equal(dim(actual),c(603,26))
expect_equal(length(rowTree(actual)$tip.label), 603)
actual <- agglomerateByRank(se, rank = "Family",
ignore.taxonomy = FALSE, empty.rows.rm = FALSE, update.tree = TRUE)
ignore.taxonomy = FALSE, empty.rm = FALSE, update.tree = TRUE)
expect_equal(dim(actual),c(603,26))
expect_equal(length(rowTree(actual)$tip.label), nrow(actual))
# Test that warning occurs when assay contian binary or negative values
Expand All @@ -92,30 +92,30 @@ test_that("agglomerate", {
data(GlobalPatterns, package="mia")
tse <- GlobalPatterns

# Check that empty.rows.rm works
# Check that empty.rm works
# Get all phyla
all_phyla <- unique( rowData(tse)$Phylum )

# When empty.rows.rm = FALSE, then phyla should also include NA --> one extra row
test0 <- agglomerateByVariable(tse, by = 1, group = "Phylum", group.rm = FALSE)
test1 <- agglomerateByRank(tse, rank = "Phylum", empty.rows.rm = FALSE)
# When empty.rm = FALSE, then phyla should also include NA --> one extra row
test0 <- agglomerateByVariable(tse, by = 1, group = "Phylum", empty.rm = FALSE)
test1 <- agglomerateByRank(tse, rank = "Phylum", empty.rm = FALSE)

# Test that dimentionality is the same for merging object by agglomerateByRank
# and agglomerateByVariable.
expect_equal(nrow(test0), length(all_phyla))
expect_equal(nrow(test1), length(all_phyla))

# When empty.rows.rm = TRUE, there should be as many rows as there are non-NA phyla
test0 <- agglomerateByVariable(tse, by = 1, group = "Phylum", group.rm = TRUE)
test1 <- agglomerateByRank(tse, rank = "Phylum", empty.rows.rm = TRUE)
# When empty.rm = TRUE, there should be as many rows as there are non-NA phyla
test0 <- agglomerateByVariable(tse, by = 1, group = "Phylum", empty.rm = TRUE)
test1 <- agglomerateByRank(tse, rank = "Phylum", empty.rm = TRUE)

# Test that dimensionality is the same when NA values are removed.
expect_equal(nrow(test0), length( all_phyla[!is.na(all_phyla)] ))
expect_equal(nrow(test1), length( all_phyla[!is.na(all_phyla)] ))

# Check that there are more taxa when agglomeration is to "Species" level
test0 <- agglomerateByVariable(tse, by = 1, group = "Species", group.rm = FALSE)
test1 <- agglomerateByRank(tse, rank = "Species", empty.rows.rm = FALSE)
test0 <- agglomerateByVariable(tse, by = 1, group = "Species", empty.rm = FALSE)
test1 <- agglomerateByRank(tse, rank = "Species", empty.rm = FALSE)
expect_equal(nrow(test0), 945)
expect_equal(nrow(test1), 2307)

Expand Down Expand Up @@ -143,9 +143,9 @@ test_that("agglomerate", {
expect_equal(rd1[, cols], rd2[, cols])
expect_true( ncol(rd1) > ncol(rd2) )
# Test that make.unique work
uniq <- agglomerateByRank(tse, rank = "Species", empty.rows.rm = FALSE)
uniq <- agglomerateByRank(tse, rank = "Species", empty.rm = FALSE)
not_uniq <- agglomerateByRank(
tse, rank = "Species", make.unique = FALSE, empty.rows.rm = FALSE)
tse, rank = "Species", make.unique = FALSE, empty.rm = FALSE)
expect_true( !any( duplicated(rownames(uniq)) ) )
expect_true( any( duplicated(rownames(not_uniq)) ) )

Expand Down
6 changes: 3 additions & 3 deletions tests/testthat/test-5dominantTaxa.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,15 @@ test_that("getDominant", {
"Order:Stramenopiles","Order:Stramenopiles","Order:Stramenopiles")
names(exp.vals.two) <- exp.names.one
expect_equal(
getDominant(tse, rank = "Genus", ignore.taxonomy = FALSE, empty.rows.rm = FALSE)[1:15],
getDominant(tse, rank = "Genus", ignore.taxonomy = FALSE, empty.rm = FALSE)[1:15],
exp.vals.two)

# Check if DominantTaxa is added to coldata
expect_equal(
colData(addDominant(tse, name="dominant"))$dominant[1:15],
exp.vals.one)
expect_equal(
colData(addDominant(tse,rank = "Genus", empty.rows.rm = FALSE, name="dominant"))$dominant[1:15],
colData(addDominant(tse,rank = "Genus", empty.rm = FALSE, name="dominant"))$dominant[1:15],
exp.vals.two)

# Check if DominantTaxa is added when factor is passed
Expand All @@ -52,7 +52,7 @@ test_that("getDominant", {
test <- tse
rowData(test)$group <- rowData(tse)$Genus
expect_equal(
colData(addDominant(test, rank = "group", group.rm = TRUE, name = "dominant"))$dominant[1:15],
colData(addDominant(test, rank = "group", empty.rm = TRUE, name = "dominant"))$dominant[1:15],
exp.vals.three)

tse1 <- tse
Expand Down
8 changes: 4 additions & 4 deletions tests/testthat/test-5prevalence.R
Original file line number Diff line number Diff line change
Expand Up @@ -106,16 +106,16 @@ test_that("getPrevalence", {
remove <- c(15, 200)
assay(tse, "counts")[remove, ] <- NA
# Check that agglomeration works
tse_agg <- agglomerateByRank(tse, ignore.taxonomy = FALSE, empty.rows.rm = FALSE, rank = rank)
tse_agg <- agglomerateByRank(tse, ignore.taxonomy = FALSE, empty.rm = FALSE, rank = rank)
expect_warning(ref <- getPrevalence(tse_agg, na.rm = FALSE))
expect_warning(res <- getPrevalence(tse, rank = "Genus", empty.rows.rm = FALSE))
expect_warning(res <- getPrevalence(tse, rank = "Genus", empty.rm = FALSE))
expect_true( all(res == ref, na.rm = TRUE) )
#
tse_agg <- agglomerateByRank(
tse, ignore.taxonomy = FALSE, empty.rows.rm = TRUE, rank = rank)
tse, ignore.taxonomy = FALSE, empty.rm = TRUE, rank = rank)
ref <- getPrevalence(tse_agg, na.rm = TRUE)
res <- getPrevalence(
tse, na.rm = TRUE, rank = "Genus", empty.rows.rm = TRUE)
tse, na.rm = TRUE, rank = "Genus", empty.rm = TRUE)
expect_true( all(res == ref, na.rm = TRUE) )
})

Expand Down
6 changes: 3 additions & 3 deletions vignettes/mia.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -137,13 +137,13 @@ becomes very easy.
altExp(tse, "family") <- x2
```

Keep in mind, that if you set `empty.rows.rm = TRUE`, rows with `NA` or similar value
Keep in mind, that if you set `empty.rm = TRUE`, rows with `NA` or similar value
(defined via the `empty.fields` argument) will be removed. Depending on these
settings different number of rows will be returned.

```{r}
x1 <- agglomerateByRank(tse, rank = "Species", empty.rows.rm = TRUE)
altExp(tse,"species") <- agglomerateByRank(tse, rank = "Species", empty.rows.rm = FALSE)
x1 <- agglomerateByRank(tse, rank = "Species", empty.rm = TRUE)
altExp(tse,"species") <- agglomerateByRank(tse, rank = "Species", empty.rm = FALSE)
dim(x1)
dim(altExp(tse,"species"))
```
Expand Down

0 comments on commit aa5c258

Please sign in to comment.