r-lib · MichaelChirico · Mar 26, 2022 · Mar 23, 2022 · Mar 23, 2022 · Mar 23, 2022
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -49,6 +49,7 @@ Collate:
     'aaa.R'
     'actions.R'
     'addins.R'
+    'any_duplicated_linter.R'
     'any_is_na_linter.R'
     'assignment_linter.R'
     'backport_linter.R'

diff --git a/NAMESPACE b/NAMESPACE
@@ -13,6 +13,7 @@ export(T_and_F_symbol_linter)
 export(absolute_path_linter)
 export(all_undesirable_functions)
 export(all_undesirable_operators)
+export(any_duplicated_linter)
 export(any_is_na_linter)
 export(assignment_linter)
 export(available_linters)

diff --git a/R/any_duplicated_linter.R b/R/any_duplicated_linter.R
@@ -0,0 +1,92 @@
+#' Require usage of anyDuplicated() > 0 over any(duplicated(.))
+#'
+#' [anyDuplicated()] exists as a replacement for `any(duplicated(.))` which is
+#'   more efficient for simple objects, and in the worst case is the same
+#'   efficiency. Therefore it should be used in all situations instead of the
+#'   latter.
+#'
+#' Also match usage like `length(unique(x$col)) == nrow(x)`, which can
+#'   be replaced by `anyDuplicated(x$col) == 0L`.
+#'
+#' @evalRd rd_tags("any_duplicated_linter")
+#' @seealso [linters] for a complete list of linters available in lintr.
+#' @export
+any_duplicated_linter <- function() {
+  Linter(function(source_file) {
+    # no parsed XML content for this expression: report no lints
+    if (length(source_file$xml_parsed_content) == 0L) {
+      return(list())
+    }
+
+    xml <- source_file$xml_parsed_content
+
+    # any(duplicated(.)) calls that either contain no comma at all, or whose
+    #   first named argument after the first comma is na.rm; this leaves
+    #   any(duplicated(y), b) and any(b, duplicated(y)) alone.
+    any_duplicated_xpath <- "//expr[
+      expr[SYMBOL_FUNCTION_CALL[text() = 'any']]
+      and expr[expr[SYMBOL_FUNCTION_CALL[text() = 'duplicated']]]
+      and (
+        not(OP-COMMA)
+        or OP-COMMA[
+          not(preceding-sibling::OP-COMMA)
+          and following-sibling::SYMBOL_SUB[1][text() = 'na.rm']
+        ]
+      )
+    ]"
+
+    any_duplicated_expr <- xml2::xml_find_all(xml, any_duplicated_xpath)
+    any_duplicated_lints <- lapply(
+      any_duplicated_expr,
+      xml_nodes_to_lint,
+      source_file = source_file,
+      lint_message = "anyDuplicated(x, ...) > 0 is better than any(duplicated(x), ...).",
+      type = "warning"
+    )
+
+    # length(unique(x)) compared with length(x), or length(unique(DF$col)) /
+    #   length(unique(DF[["col"]])) compared with nrow(DF), via ==, !=, > or <.
+    #   The inner predicate is evaluated on the expr holding the unique symbol:
+    #   three parent::expr steps climb to the comparison, then
+    #   /expr/expr[...]/following-sibling::expr collects the arguments of the
+    #   length()/nrow() calls on both sides of the operator, so the lint fires
+    #   whichever side length(unique(.)) is written on. Requiring the argument
+    #   to be equal avoids spurious matches like length(unique(x)) == length(y).
+    length_unique_xpath <- "
+    //expr[EQ or NE or GT or LT]
+    /expr[
+      expr[SYMBOL_FUNCTION_CALL[text() = 'length']]
+      and expr[expr[
+        SYMBOL_FUNCTION_CALL[text() = 'unique']
+        and (
+          following-sibling::expr =
+            parent::expr
+            /parent::expr
+            /parent::expr
+            /expr
+            /expr[SYMBOL_FUNCTION_CALL[text()= 'length']]
+            /following-sibling::expr
+          or
+          following-sibling::expr[OP-DOLLAR or LBB]/expr[1] =
+            parent::expr
+            /parent::expr
+            /parent::expr
+            /expr
+            /expr[SYMBOL_FUNCTION_CALL[text()= 'nrow']]
+            /following-sibling::expr
+        )
+      ]]
+    ]"
+    length_unique_expr <- xml2::xml_find_all(xml, length_unique_xpath)
+    length_unique_lints <- lapply(
+      length_unique_expr,
+      xml_nodes_to_lint,
+      source_file = source_file,
+      lint_message =
+        "anyDuplicated(x) == 0L is better than length(unique(x)) == length(x) and length(unique(DF$col)) == nrow(DF)",
+      type = "warning"
+    )
+
+    c(any_duplicated_lints, length_unique_lints)
+  })
+}
diff --git a/inst/lintr/linters.csv b/inst/lintr/linters.csv
@@ -1,5 +1,6 @@
 linter,tags
 absolute_path_linter,robustness best_practices configurable
+any_duplicated_linter,efficiency best_practices
 any_is_na_linter,efficiency best_practices
 assignment_linter,style consistency default
 backport_linter,robustness configurable package_development

diff --git a/man/any_duplicated_linter.Rd b/man/any_duplicated_linter.Rd
diff --git a/man/best_practices_linters.Rd b/man/best_practices_linters.Rd
diff --git a/man/efficiency_linters.Rd b/man/efficiency_linters.Rd
diff --git a/man/linters.Rd b/man/linters.Rd
diff --git a/tests/testthat/test-any_duplicated_linter.R b/tests/testthat/test-any_duplicated_linter.R
@@ -0,0 +1,68 @@
+test_that("any_duplicated_linter skips allowed usages", {
+  linter <- any_duplicated_linter()
+
+  # any() and duplicated() used on their own are fine
+  expect_lint("x <- any(y)", NULL, linter)
+  expect_lint("y <- duplicated(z)", NULL, linter)
+
+  # extended usage of any is not covered
+  expect_lint("any(duplicated(y), b)", NULL, linter)
+  expect_lint("any(b, duplicated(y))", NULL, linter)
+})
+
+test_that("any_duplicated_linter blocks simple disallowed usages", {
+  linter <- any_duplicated_linter()
+  msg <- rex::rex("anyDuplicated(x, ...) > 0 is better")
+
+  expect_lint("any(duplicated(x))", msg, linter)
+  expect_lint("any(duplicated(foo(x)))", msg, linter)
+
+  # na.rm doesn't really matter for this since duplicated can't return NA
+  expect_lint("any(duplicated(x), na.rm = TRUE)", msg, linter)
+
+  # also catch nested usage
+  expect_lint("foo(any(duplicated(x)))", msg, linter)
+})
+
+test_that("any_duplicated_linter catches length(unique()) equivalencies too", {
+  linter <- any_duplicated_linter()
+  msg <- rex::rex("anyDuplicated(x) == 0L is better than length(unique(x)) == length(x)")
+
+  # non-matches
+  ## different variable
+  expect_lint("length(unique(x)) == length(y)", NULL, linter)
+  ## different table
+  expect_lint("length(unique(DF$x)) == nrow(DT)", NULL, linter)
+  expect_lint("length(unique(l1$DF$x)) == nrow(l2$DF)", NULL, linter)
+
+  # lintable usage
+  expect_lint("length(unique(x)) == length(x)", msg, linter)
+  # argument order doesn't matter
+  expect_lint("length(x) == length(unique(x))", msg, linter)
+  # nrow-style equivalency
+  expect_lint("nrow(DF) == length(unique(DF$col))", msg, linter)
+  expect_lint("nrow(DF) == length(unique(DF[['col']]))", msg, linter)
+  # match with nesting too
+  expect_lint("nrow(l$DF) == length(unique(l$DF[['col']]))", msg, linter)
+
+  # !=, <, and > usages are all alternative ways of writing a test for dupes
+  #   technically, the direction of > / < matter, but writing
+  #   length(unique(x)) > length(x) doesn't seem like it would ever happen.
+  expect_lint("length(unique(x)) != length(x)", msg, linter)
+  expect_lint("length(unique(x)) < length(x)", msg, linter)
+  expect_lint("length(x) > length(unique(x))", msg, linter)
+
+  # TODO(michaelchirico): try and match data.table- and dplyr-specific versions of
+  #   this, e.g. DT[, length(unique(col)) == .N] or
+  #   DT %>% filter(length(unique(col)) == n())
+})
+
+test_that("any_duplicated_linter catches expression with two types of lint", {
+  any_duplicated_msg <- rex::rex("anyDuplicated(x, ...) > 0 is better")
+  length_unique_msg <- rex::rex("anyDuplicated(x) == 0L is better")
+  expect_lint(
+    "table(any(duplicated(x)), length(unique(DF$col)) == nrow(DF))",
+    list(any_duplicated_msg, length_unique_msg),
+    any_duplicated_linter()
+  )
+})