Skip to content

New any_duplicated_linter #986

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Mar 26, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ Collate:
'aaa.R'
'actions.R'
'addins.R'
'any_duplicated_linter.R'
'any_is_na_linter.R'
'assignment_linter.R'
'backport_linter.R'
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ export(T_and_F_symbol_linter)
export(absolute_path_linter)
export(all_undesirable_functions)
export(all_undesirable_operators)
export(any_duplicated_linter)
export(any_is_na_linter)
export(assignment_linter)
export(available_linters)
Expand Down
116 changes: 116 additions & 0 deletions R/any_duplicated_linter.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#' Require usage of anyDuplicated() > 0 over any(duplicated(.))
#'
#' [anyDuplicated()] exists as a replacement for `any(duplicated(.))` which is
#' more efficient for simple objects, and in the worst case is the same
#' efficiency. Therefore it should be used in all situations instead of the
#' latter.
#'
#' Also match usage like `length(unique(x$col)) == nrow(x)`, which can
#' be replaced by `anyDuplicated(x$col) == 0L`.
#'
#' @evalRd rd_tags("any_duplicated_linter")
#' @seealso [linters] for a complete list of linters available in lintr.
#' @export
any_duplicated_linter <- function() {
Linter(function(source_file) {
if (length(source_file$xml_parsed_content) == 0L) {
return(list())
}

xml <- source_file$xml_parsed_content

any_duplicated_xpath <- "//expr[
expr[SYMBOL_FUNCTION_CALL[text() = 'any']]
and expr[expr[SYMBOL_FUNCTION_CALL[text() = 'duplicated']]]
and (
not(OP-COMMA)
or OP-COMMA[
not(preceding-sibling::OP-COMMA)
and following-sibling::SYMBOL_SUB[1][text() = 'na.rm']
]
)
]"

any_duplicated_expr <- xml2::xml_find_all(xml, any_duplicated_xpath)
any_duplicated_lints <- lapply(
any_duplicated_expr,
xml_nodes_to_lint,
source_file = source_file,
lint_message = "anyDuplicated(x, ...) > 0 is better than any(duplicated(x), ...).",
type = "warning"
)

# path from the expr of the unique() call to the call that needs to match.
# the final parent::expr/expr gets us to the expr on the other side of EQ;
# this lets us match on either side of EQ, where following-sibling
# assumes we are before EQ, preceding-sibling assumes we are after EQ.
path_to_neighbor_call_expr_fmt <- file.path(
"parent::expr",
"parent::expr",
"parent::expr",
"expr",
"expr[SYMBOL_FUNCTION_CALL[text()= '%s']]",
"following-sibling::expr"
)
unique_expr_xpath <- xp_and(
"SYMBOL_FUNCTION_CALL[text() = 'unique']",
# ensure the expr matches to avoid spurious match like
# length(unique(x)) == length(y)
xp_or(
# length(unique(x)) == length(x).
sprintf(
"following-sibling::expr = %s",
sprintf(path_to_neighbor_call_expr_fmt, "length")
),
# length(unique( << DF$col or DF[["col"]] >> )) == nrow(DF)
sprintf(
"following-sibling::expr[OP-DOLLAR or LBB]/expr[1] = %s",
sprintf(path_to_neighbor_call_expr_fmt, "nrow")
)
)
)
length_unique_call_xpath <- xp_and(
"expr[SYMBOL_FUNCTION_CALL[text() = 'length']]",
sprintf("expr[expr[%s]]", unique_expr_xpath)
)
# EQ ensures we're in an ==, !=, <, or > clause
length_unique_xpath <-
sprintf("//expr[EQ or NE or GT or LT]/expr[%s]", length_unique_call_xpath)
length_unique_xpath <- "
//expr[EQ or NE or GT or LT]
/expr[
expr[SYMBOL_FUNCTION_CALL[text() = 'length']]
and expr[expr[
SYMBOL_FUNCTION_CALL[text() = 'unique']
and (
following-sibling::expr =
parent::expr
/parent::expr
/parent::expr
/expr
/expr[SYMBOL_FUNCTION_CALL[text()= 'length']]
/following-sibling::expr
or
following-sibling::expr[OP-DOLLAR or LBB]/expr[1] =
parent::expr
/parent::expr
/parent::expr
/expr
/expr[SYMBOL_FUNCTION_CALL[text()= 'nrow']]
/following-sibling::expr
)
]]
]"
length_unique_expr <- xml2::xml_find_all(xml, length_unique_xpath)
length_unique_lints <- lapply(
length_unique_expr,
xml_nodes_to_lint,
source_file = source_file,
lint_message =
"anyDuplicated(x) == 0L is better than length(unique(x)) == length(x) and length(unique(DF$col)) == nrow(DF)",
type = "warning"
)

return(c(any_duplicated_lints, length_unique_lints))
})
}
1 change: 1 addition & 0 deletions inst/lintr/linters.csv
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
linter,tags
absolute_path_linter,robustness best_practices configurable
any_duplicated_linter,efficiency best_practices
any_is_na_linter,efficiency best_practices
assignment_linter,style consistency default
backport_linter,robustness configurable package_development
Expand Down
24 changes: 24 additions & 0 deletions man/any_duplicated_linter.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions man/best_practices_linters.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions man/efficiency_linters.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 4 additions & 3 deletions man/linters.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

107 changes: 107 additions & 0 deletions tests/testthat/test-any_duplicated_linter.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
test_that("any_duplicated_linter skips allowed usages", {
expect_lint("x <- any(y)", NULL, any_duplicated_linter())

expect_lint("y <- duplicated(z)", NULL, any_duplicated_linter())

# extended usage of any is not covered
expect_lint("any(duplicated(y), b)", NULL, any_duplicated_linter())
expect_lint("any(b, duplicated(y))", NULL, any_duplicated_linter())
})

test_that("any_duplicated_linter blocks simple disallowed usages", {
expect_lint(
"any(duplicated(x))",
rex::rex("anyDuplicated(x, ...) > 0 is better"),
any_duplicated_linter()
)

expect_lint(
"any(duplicated(foo(x)))",
rex::rex("anyDuplicated(x, ...) > 0 is better"),
any_duplicated_linter()
)

# na.rm doesn't really matter for this since duplicated can't return NA
expect_lint(
"any(duplicated(x), na.rm = TRUE)",
rex::rex("anyDuplicated(x, ...) > 0 is better"),
any_duplicated_linter()
)

# also catch nested usage
expect_lint(
"foo(any(duplicated(x)))",
rex::rex("anyDuplicated(x, ...) > 0 is better"),
any_duplicated_linter()
)
})

test_that("any_duplicated_linter catches length(unique()) equivalencies too", {
# non-matches
## different variable
expect_lint("length(unique(x)) == length(y)", NULL, any_duplicated_linter())
## different table
expect_lint("length(unique(DF$x)) == nrow(DT)", NULL, any_duplicated_linter())
expect_lint("length(unique(l1$DF$x)) == nrow(l2$DF)", NULL, any_duplicated_linter())

# lintable usage
expect_lint(
"length(unique(x)) == length(x)",
rex::rex("anyDuplicated(x) == 0L is better than length(unique(x)) == length(x)"),
any_duplicated_linter()
)
# argument order doesn't matter
expect_lint(
"length(x) == length(unique(x))",
rex::rex("anyDuplicated(x) == 0L is better than length(unique(x)) == length(x)"),
any_duplicated_linter()
)
# nrow-style equivalency
expect_lint(
"nrow(DF) == length(unique(DF$col))",
rex::rex("anyDuplicated(x) == 0L is better than length(unique(x)) == length(x)"),
any_duplicated_linter()
)
expect_lint(
"nrow(DF) == length(unique(DF[['col']]))",
rex::rex("anyDuplicated(x) == 0L is better than length(unique(x)) == length(x)"),
any_duplicated_linter()
)
# match with nesting too
expect_lint(
"nrow(l$DF) == length(unique(l$DF[['col']]))",
rex::rex("anyDuplicated(x) == 0L is better than length(unique(x)) == length(x)"),
any_duplicated_linter()
)

# !=, <, and > usages are all alternative ways of writing a test for dupes
# technically, the direction of > / < matter, but writing
# length(unique(x)) > length(x) doesn't seem like it would ever happen.
expect_lint(
"length(unique(x)) != length(x)",
rex::rex("anyDuplicated(x) == 0L is better than length(unique(x)) == length(x)"),
any_duplicated_linter()
)
expect_lint(
"length(unique(x)) < length(x)",
rex::rex("anyDuplicated(x) == 0L is better than length(unique(x)) == length(x)"),
any_duplicated_linter()
)
expect_lint(
"length(x) > length(unique(x))",
rex::rex("anyDuplicated(x) == 0L is better than length(unique(x)) == length(x)"),
any_duplicated_linter()
)

# TODO(michaelchirico): try and match data.table- and dplyr-specific versions of
# this, e.g. DT[, length(unique(col)) == .N] or
# DT %>% filter(length(unique(col)) == n())
})

test_that("any_duplicated_linter catches expression with two types of lint", {
expect_lint(
"table(any(duplicated(x)), length(unique(DF$col)) == nrow(DF))",
list(rex::rex("anyDuplicated(x, ...) > 0 is better"), rex::rex("anyDuplicated(x) == 0L is better")),
any_duplicated_linter()
)
})