Skip to content

Commit 727ec1a

Browse files
New any_duplicated_linter (#986)
* initial work importing any_duplicated_linter * refactor xpath formulation, simplify tests * customize lint message * xml_parsed_content
1 parent 863de33 commit 727ec1a

File tree

9 files changed

+256
-3
lines changed

9 files changed

+256
-3
lines changed

DESCRIPTION

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ Collate:
4949
'aaa.R'
5050
'actions.R'
5151
'addins.R'
52+
'any_duplicated_linter.R'
5253
'any_is_na_linter.R'
5354
'assignment_linter.R'
5455
'backport_linter.R'

NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ export(T_and_F_symbol_linter)
1313
export(absolute_path_linter)
1414
export(all_undesirable_functions)
1515
export(all_undesirable_operators)
16+
export(any_duplicated_linter)
1617
export(any_is_na_linter)
1718
export(assignment_linter)
1819
export(available_linters)

R/any_duplicated_linter.R

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
#' Require usage of anyDuplicated() > 0 over any(duplicated(.))
2+
#'
3+
#' [anyDuplicated()] exists as a replacement for `any(duplicated(.))` which is
4+
#' more efficient for simple objects, and in the worst case is the same
5+
#' efficiency. Therefore it should be used in all situations instead of the
6+
#' latter.
7+
#'
8+
#' Also match usage like `length(unique(x$col)) == nrow(x)`, which can
9+
#' be replaced by `anyDuplicated(x$col) == 0L`.
10+
#'
11+
#' @evalRd rd_tags("any_duplicated_linter")
12+
#' @seealso [linters] for a complete list of linters available in lintr.
13+
#' @export
14+
any_duplicated_linter <- function() {
15+
Linter(function(source_file) {
16+
if (length(source_file$xml_parsed_content) == 0L) {
17+
return(list())
18+
}
19+
20+
xml <- source_file$xml_parsed_content
21+
22+
any_duplicated_xpath <- "//expr[
23+
expr[SYMBOL_FUNCTION_CALL[text() = 'any']]
24+
and expr[expr[SYMBOL_FUNCTION_CALL[text() = 'duplicated']]]
25+
and (
26+
not(OP-COMMA)
27+
or OP-COMMA[
28+
not(preceding-sibling::OP-COMMA)
29+
and following-sibling::SYMBOL_SUB[1][text() = 'na.rm']
30+
]
31+
)
32+
]"
33+
34+
any_duplicated_expr <- xml2::xml_find_all(xml, any_duplicated_xpath)
35+
any_duplicated_lints <- lapply(
36+
any_duplicated_expr,
37+
xml_nodes_to_lint,
38+
source_file = source_file,
39+
lint_message = "anyDuplicated(x, ...) > 0 is better than any(duplicated(x), ...).",
40+
type = "warning"
41+
)
42+
43+
# path from the expr of the unique() call to the call that needs to match.
44+
# the final parent::expr/expr gets us to the expr on the other side of EQ;
45+
# this lets us match on either side of EQ, where following-sibling
46+
# assumes we are before EQ, preceding-sibling assumes we are after EQ.
47+
path_to_neighbor_call_expr_fmt <- file.path(
48+
"parent::expr",
49+
"parent::expr",
50+
"parent::expr",
51+
"expr",
52+
"expr[SYMBOL_FUNCTION_CALL[text()= '%s']]",
53+
"following-sibling::expr"
54+
)
55+
unique_expr_xpath <- xp_and(
56+
"SYMBOL_FUNCTION_CALL[text() = 'unique']",
57+
# ensure the expr matches to avoid spurious match like
58+
# length(unique(x)) == length(y)
59+
xp_or(
60+
# length(unique(x)) == length(x).
61+
sprintf(
62+
"following-sibling::expr = %s",
63+
sprintf(path_to_neighbor_call_expr_fmt, "length")
64+
),
65+
# length(unique( << DF$col or DF[["col"]] >> )) == nrow(DF)
66+
sprintf(
67+
"following-sibling::expr[OP-DOLLAR or LBB]/expr[1] = %s",
68+
sprintf(path_to_neighbor_call_expr_fmt, "nrow")
69+
)
70+
)
71+
)
72+
length_unique_call_xpath <- xp_and(
73+
"expr[SYMBOL_FUNCTION_CALL[text() = 'length']]",
74+
sprintf("expr[expr[%s]]", unique_expr_xpath)
75+
)
76+
# EQ ensures we're in an ==, !=, <, or > clause
77+
length_unique_xpath <-
78+
sprintf("//expr[EQ or NE or GT or LT]/expr[%s]", length_unique_call_xpath)
79+
length_unique_xpath <- "
80+
//expr[EQ or NE or GT or LT]
81+
/expr[
82+
expr[SYMBOL_FUNCTION_CALL[text() = 'length']]
83+
and expr[expr[
84+
SYMBOL_FUNCTION_CALL[text() = 'unique']
85+
and (
86+
following-sibling::expr =
87+
parent::expr
88+
/parent::expr
89+
/parent::expr
90+
/expr
91+
/expr[SYMBOL_FUNCTION_CALL[text()= 'length']]
92+
/following-sibling::expr
93+
or
94+
following-sibling::expr[OP-DOLLAR or LBB]/expr[1] =
95+
parent::expr
96+
/parent::expr
97+
/parent::expr
98+
/expr
99+
/expr[SYMBOL_FUNCTION_CALL[text()= 'nrow']]
100+
/following-sibling::expr
101+
)
102+
]]
103+
]"
104+
length_unique_expr <- xml2::xml_find_all(xml, length_unique_xpath)
105+
length_unique_lints <- lapply(
106+
length_unique_expr,
107+
xml_nodes_to_lint,
108+
source_file = source_file,
109+
lint_message =
110+
"anyDuplicated(x) == 0L is better than length(unique(x)) == length(x) and length(unique(DF$col)) == nrow(DF)",
111+
type = "warning"
112+
)
113+
114+
return(c(any_duplicated_lints, length_unique_lints))
115+
})
116+
}

inst/lintr/linters.csv

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
linter,tags
22
absolute_path_linter,robustness best_practices configurable
3+
any_duplicated_linter,efficiency best_practices
34
any_is_na_linter,efficiency best_practices
45
assignment_linter,style consistency default
56
backport_linter,robustness configurable package_development

man/any_duplicated_linter.Rd

Lines changed: 24 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/best_practices_linters.Rd

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/efficiency_linters.Rd

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/linters.Rd

Lines changed: 4 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
test_that("any_duplicated_linter skips allowed usages", {
2+
expect_lint("x <- any(y)", NULL, any_duplicated_linter())
3+
4+
expect_lint("y <- duplicated(z)", NULL, any_duplicated_linter())
5+
6+
# extended usage of any is not covered
7+
expect_lint("any(duplicated(y), b)", NULL, any_duplicated_linter())
8+
expect_lint("any(b, duplicated(y))", NULL, any_duplicated_linter())
9+
})
10+
11+
test_that("any_duplicated_linter blocks simple disallowed usages", {
12+
expect_lint(
13+
"any(duplicated(x))",
14+
rex::rex("anyDuplicated(x, ...) > 0 is better"),
15+
any_duplicated_linter()
16+
)
17+
18+
expect_lint(
19+
"any(duplicated(foo(x)))",
20+
rex::rex("anyDuplicated(x, ...) > 0 is better"),
21+
any_duplicated_linter()
22+
)
23+
24+
# na.rm doesn't really matter for this since duplicated can't return NA
25+
expect_lint(
26+
"any(duplicated(x), na.rm = TRUE)",
27+
rex::rex("anyDuplicated(x, ...) > 0 is better"),
28+
any_duplicated_linter()
29+
)
30+
31+
# also catch nested usage
32+
expect_lint(
33+
"foo(any(duplicated(x)))",
34+
rex::rex("anyDuplicated(x, ...) > 0 is better"),
35+
any_duplicated_linter()
36+
)
37+
})
38+
39+
test_that("any_duplicated_linter catches length(unique()) equivalencies too", {
40+
# non-matches
41+
## different variable
42+
expect_lint("length(unique(x)) == length(y)", NULL, any_duplicated_linter())
43+
## different table
44+
expect_lint("length(unique(DF$x)) == nrow(DT)", NULL, any_duplicated_linter())
45+
expect_lint("length(unique(l1$DF$x)) == nrow(l2$DF)", NULL, any_duplicated_linter())
46+
47+
# lintable usage
48+
expect_lint(
49+
"length(unique(x)) == length(x)",
50+
rex::rex("anyDuplicated(x) == 0L is better than length(unique(x)) == length(x)"),
51+
any_duplicated_linter()
52+
)
53+
# argument order doesn't matter
54+
expect_lint(
55+
"length(x) == length(unique(x))",
56+
rex::rex("anyDuplicated(x) == 0L is better than length(unique(x)) == length(x)"),
57+
any_duplicated_linter()
58+
)
59+
# nrow-style equivalency
60+
expect_lint(
61+
"nrow(DF) == length(unique(DF$col))",
62+
rex::rex("anyDuplicated(x) == 0L is better than length(unique(x)) == length(x)"),
63+
any_duplicated_linter()
64+
)
65+
expect_lint(
66+
"nrow(DF) == length(unique(DF[['col']]))",
67+
rex::rex("anyDuplicated(x) == 0L is better than length(unique(x)) == length(x)"),
68+
any_duplicated_linter()
69+
)
70+
# match with nesting too
71+
expect_lint(
72+
"nrow(l$DF) == length(unique(l$DF[['col']]))",
73+
rex::rex("anyDuplicated(x) == 0L is better than length(unique(x)) == length(x)"),
74+
any_duplicated_linter()
75+
)
76+
77+
# !=, <, and > usages are all alternative ways of writing a test for dupes
78+
# technically, the direction of > / < matter, but writing
79+
# length(unique(x)) > length(x) doesn't seem like it would ever happen.
80+
expect_lint(
81+
"length(unique(x)) != length(x)",
82+
rex::rex("anyDuplicated(x) == 0L is better than length(unique(x)) == length(x)"),
83+
any_duplicated_linter()
84+
)
85+
expect_lint(
86+
"length(unique(x)) < length(x)",
87+
rex::rex("anyDuplicated(x) == 0L is better than length(unique(x)) == length(x)"),
88+
any_duplicated_linter()
89+
)
90+
expect_lint(
91+
"length(x) > length(unique(x))",
92+
rex::rex("anyDuplicated(x) == 0L is better than length(unique(x)) == length(x)"),
93+
any_duplicated_linter()
94+
)
95+
96+
# TODO(michaelchirico): try and match data.table- and dplyr-specific versions of
97+
# this, e.g. DT[, length(unique(col)) == .N] or
98+
# DT %>% filter(length(unique(col)) == n())
99+
})
100+
101+
test_that("any_duplicated_linter catches expression with two types of lint", {
102+
expect_lint(
103+
"table(any(duplicated(x)), length(unique(DF$col)) == nrow(DF))",
104+
list(rex::rex("anyDuplicated(x, ...) > 0 is better"), rex::rex("anyDuplicated(x) == 0L is better")),
105+
any_duplicated_linter()
106+
)
107+
})

0 commit comments

Comments
 (0)