r-lib · MichaelChirico · Mar 28, 2022 · Mar 28, 2022 · Mar 28, 2022 · Mar 28, 2022
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -112,6 +112,7 @@ Collate:
     'path_linters.R'
     'pipe_call_linter.R'
     'pipe_continuation_linter.R'
+    'regex_subset_linter.R'
     'semicolon_terminator_linter.R'
     'seq_linter.R'
     'settings.R'

diff --git a/NAMESPACE b/NAMESPACE
@@ -76,6 +76,7 @@ export(paren_brace_linter)
 export(paste_sep_linter)
 export(pipe_call_linter)
 export(pipe_continuation_linter)
+export(regex_subset_linter)
 export(semicolon_terminator_linter)
 export(seq_linter)
 export(single_quotes_linter)

diff --git a/NEWS.md b/NEWS.md
@@ -107,6 +107,7 @@ function calls. (#850, #851, @renkun-ken)
    * `paste_sep_linter()` Require usage of `paste0()` over `paste(sep = "")`
    * `nested_ifelse_linter()` Prevent nested calls to `ifelse()` like `ifelse(A, x, ifelse(B, y, z))`, and similar
    * `unreachable_code_linter()` Prevent code after `return()` and `stop()` statements that will never be reached
+   * `regex_subset_linter()` Require usage of `grep(ptn, x, value = TRUE)` over `x[grep(ptn, x)]` and similar
    * `consecutive_stopifnot_linter()` Require consecutive calls to `stopifnot()` to be unified into one
 * `assignment_linter()` now lints right assignment (`->` and `->>`) and gains two arguments. `allow_cascading_assign` (`TRUE` by default) toggles whether to lint `<<-` and `->>`; `allow_right_assign` toggles whether to lint `->` and `->>` (#915, @michaelchirico)
 * `infix_spaces_linter()` gains argument `exclude_operators` to disable lints on selected infix operators. By default, all "low-precedence" operators throw lints; see `?infix_spaces_linter` for an enumeration of these. (#914 @michaelchirico)

diff --git a/R/regex_subset_linter.R b/R/regex_subset_linter.R
@@ -0,0 +1,87 @@
+#' Require usage of direct methods for subsetting strings via regex
+#'
+#' Using `value = TRUE` in [grep()] returns the subset of the input that matches
+#'   the pattern, e.g. `grep("[a-m]", letters, value = TRUE)` will return the
+#'   first 13 elements (`a` through `m`).
+#'
+#' `letters[grep("[a-m]", letters)]` and `letters[grepl("[a-m]", letters)]`
+#'   both return the same thing, but more circuitously and more verbosely.
+#'
+#' The `stringr` package also provides an even more readable alternative,
+#'   namely `str_subset()`, which should be preferred to versions using
+#'   `str_detect()` and `str_which()`.
+#'
+#' @section Exceptions:
+#'   Note that `x[grep(pattern, x)]` and `grep(pattern, x, value = TRUE)`
+#'   are not _completely_ interchangeable when `x` is not character
+#'   (most commonly, when `x` is a factor), because the output of the
+#'   latter will be a character vector while the former remains a factor.
+#'   It still may be preferable to refactor such code, as it may be faster
+#'   to match the pattern on `levels(x)` and use that to subset instead.
+#'
+#' @evalRd rd_tags("regex_subset_linter")
+#' @seealso [linters] for a complete list of linters available in lintr.
+#' @export
+regex_subset_linter <- function() {
+  Linter(function(source_file) {
+    xml <- source_file$xml_parsed_content
+    if (length(xml) == 0L) {
+      return(list())
+    }
+
+    # Candidate parents are <expr>[...] expressions that are read, not assigned to:
+    #   a subassignment such as x[grep(ptn, x)] <- "" cannot be rewritten with
+    #   value = TRUE. The assignment token is found under parent::expr for
+    #   LEFT_ASSIGN and RIGHT_ASSIGN but under parent::equal_assign for EQ_ASSIGN,
+    #   so parent::* is used to cover all three.
+    bracket_read_cond <- xp_and(
+      "OP-LEFT-BRACKET",
+      "not(parent::*[LEFT_ASSIGN or EQ_ASSIGN or RIGHT_ASSIGN])"
+    )
+
+    # The subscript must be a call to one of the target functions (%s) whose
+    #   %d-th child expr equals the expression being subset. XPath compares node
+    #   sets by string value (https://www.w3.org/TR/1999/REC-xpath-19991116/#booleans),
+    #   so compound expressions match too, e.g. names(x)[grepl(ptn, names(x))].
+    index_call_cond_fmt <- xp_and(
+      "expr[SYMBOL_FUNCTION_CALL[%s]]",
+      "expr[position() = %d] = parent::expr/expr[1]"
+    )
+
+    # Build the XPath for one family of functions and turn every hit into a lint.
+    #   `arg_pos` counts child <expr> nodes of the call, the function name being the
+    #   first; with positional arguments 3L is grep()'s input, 2L is str_detect()'s.
+    find_subset_lints <- function(funs, arg_pos, lint_message) {
+      index_call_cond <- sprintf(index_call_cond_fmt, xp_text_in_table(funs), arg_pos)
+      subset_xpath <- sprintf("//expr[%s]/expr[%s]", bracket_read_cond, index_call_cond)
+      lapply(
+        xml2::xml_find_all(xml, subset_xpath),
+        xml_nodes_to_lint,
+        source_file = source_file,
+        lint_message = lint_message,
+        type = "warning"
+      )
+    }
+
+    # base R: x[grep(pattern, x)] and x[grepl(pattern, x)]
+    grep_lints <- find_subset_lints(
+      funs = c("grep", "grepl"),
+      arg_pos = 3L,
+      lint_message = paste(
+        "Prefer grep(pattern, x, ..., value = TRUE) over",
+        "x[grep(pattern, x, ...)] and x[grepl(pattern, x, ...)]."
+      )
+    )
+    # stringr: x[str_detect(x, pattern)] and x[str_which(x, pattern)]
+    stringr_lints <- find_subset_lints(
+      funs = c("str_detect", "str_which"),
+      arg_pos = 2L,
+      lint_message = paste(
+        "Prefer stringr::str_subset(x, pattern) over",
+        "x[str_detect(x, pattern)] and x[str_which(x, pattern)]."
+      )
+    )
+
+    c(grep_lints, stringr_lints)
+  })
+}
diff --git a/inst/lintr/linters.csv b/inst/lintr/linters.csv
@@ -49,6 +49,7 @@ paren_brace_linter,style readability default
 paste_sep_linter,best_practices consistency
 pipe_call_linter,style readability
 pipe_continuation_linter,style readability default
+regex_subset_linter,best_practices efficiency
 semicolon_terminator_linter,style readability default configurable
 seq_linter,robustness efficiency consistency best_practices default
 single_quotes_linter,style consistency readability default

diff --git a/man/best_practices_linters.Rd b/man/best_practices_linters.Rd
diff --git a/man/efficiency_linters.Rd b/man/efficiency_linters.Rd
diff --git a/man/linters.Rd b/man/linters.Rd
diff --git a/man/regex_subset_linter.Rd b/man/regex_subset_linter.Rd
diff --git a/tests/testthat/test-regex_subset_linter.R b/tests/testthat/test-regex_subset_linter.R
@@ -0,0 +1,50 @@
+test_that("regex_subset_linter skips allowed usages", {
+  # nothing to report unless the object being subset is itself the expression searched by grepl()
+  for (code in c("y[grepl(ptn, x)]", "x[grepl(ptn, foo(x))]")) expect_lint(code, NULL, regex_subset_linter())
+})
+
+test_that("regex_subset_linter blocks simple disallowed usages", {
+  # each snippet subsets an expression by the result of grep()/grepl() run on
+  #   that very same expression, so each one expects the base R lint message
+  linter <- regex_subset_linter()
+  lint_msg <- rex::rex("Prefer grep(pattern, x, ..., value = TRUE)")
+
+  # integer positions from grep() on a bare symbol
+  expect_lint("x[grep(ptn, x)]", lint_msg, linter)
+
+  # logical mask from grepl() on a call, with an extra named argument
+  expect_lint(
+    "names(y)[grepl(ptn, names(y), perl = TRUE)]",
+    lint_msg,
+    linter
+  )
+
+  # nested calls on both sides
+  expect_lint("names(foo(y))[grepl(ptn, names(foo(y)), fixed = TRUE)]", lint_msg, linter)
+})
+
+test_that("regex_subset_linter skips grep/grepl subassignment", {
+  expect_lint("x[grep(ptn, x)] <- ''", NULL, regex_subset_linter()) # left assignment, grep()
+  expect_lint("x[grepl(ptn, x)] <- ''", NULL, regex_subset_linter()) # left assignment, grepl()
+  expect_lint("x[grep(ptn, x, perl = TRUE)] = ''", NULL, regex_subset_linter()) # equals assignment
+  expect_lint("'' -> x[grep(ptn, x, ignore.case = TRUE)]", NULL, regex_subset_linter()) # right assignment
+})
+
+test_that("regex_subset_linter works for stringr equivalents", {
+  linter <- regex_subset_linter()
+  lint_msg <- rex::rex("Prefer stringr::str_subset(x, pattern) over")
+
+  # no lint unless the object being subset is exactly the expression passed as
+  #   the first argument of str_detect()
+  expect_lint("y[str_detect(x, ptn)]", NULL, linter)
+  expect_lint("x[str_detect(foo(x), ptn)]", NULL, linter)
+
+  # both the index form (str_which) and the mask form (str_detect) are flagged
+  expect_lint("x[str_which(x, ptn)]", lint_msg, linter)
+  expect_lint("names(y)[str_detect(names(y), ptn, negate = TRUE)]", lint_msg, linter)
+
+  # subassignment is not a subsetting read, so neither function is flagged there
+  expect_lint("x[str_detect(x, ptn)] <- ''", NULL, linter)
+  expect_lint("x[str_which(x, ptn)] <- ''", NULL, linter)
+})