Add --ignore-multiline-regex option. (#3476)

julian-smith-artifex-com · web-flow · commit f3d85db4f187 · 2024-07-08T16:04:09.000-04:00
diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
@@ -36,6 +36,7 @@
     Pattern,
     Sequence,
     Set,
+    TextIO,
     Tuple,
 )
 
@@ -201,11 +202,17 @@ def __str__(self) -> str:
 
 
 class FileOpener:
-    def __init__(self, use_chardet: bool, quiet_level: int) -> None:
+    def __init__(
+        self,
+        use_chardet: bool,
+        quiet_level: int,
+        ignore_multiline_regex: Optional[Pattern[str]],
+    ) -> None:
         self.use_chardet = use_chardet
         if use_chardet:
             self.init_chardet()
         self.quiet_level = quiet_level
+        self.ignore_multiline_regex = ignore_multiline_regex
 
     def init_chardet(self) -> None:
         try:
@@ -247,7 +254,7 @@ def open_with_chardet(self, filename: str) -> Tuple[List[str], str]:
             )
             raise
         else:
-            lines = f.readlines()
+            lines = self.get_lines(f)
             f.close()
 
         return lines, f.encoding
@@ -262,7 +269,7 @@ def open_with_internal(self, filename: str) -> Tuple[List[str], str]:
                 print(f'WARNING: Trying next encoding "{encoding}"', file=sys.stderr)
             with open(filename, encoding=encoding, newline="") as f:
                 try:
-                    lines = f.readlines()
+                    lines = self.get_lines(f)
                 except UnicodeDecodeError:
                     if not self.quiet_level & QuietLevels.ENCODING:
                         print(
@@ -279,6 +286,22 @@ def open_with_internal(self, filename: str) -> Tuple[List[str], str]:
 
         return lines, encoding
 
+    def get_lines(self, f: TextIO) -> List[str]:
+        if self.ignore_multiline_regex:
+            text = f.read()
+            pos = 0
+            text2 = ""
+            for m in re.finditer(self.ignore_multiline_regex, text):
+                text2 += text[pos : m.start()]
+                # Replace with blank lines so line numbers are unchanged.
+                text2 += "\n" * m.group().count("\n")
+                pos = m.end()
+            text2 += text[pos:]
+            lines = text2.split("\n")
+        else:
+            lines = f.readlines()
+        return lines
+
 
 # -.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-
 
@@ -411,6 +434,19 @@ def parse_options(
         'e.g., "\\bmatch\\b". Defaults to '
         "empty/disabled.",
     )
+    parser.add_argument(
+        "--ignore-multiline-regex",
+        action="store",
+        type=str,
+        help="regular expression that is used to ignore "
+        "text that may span multi-line regions. "
+        "The regex is run with re.DOTALL. For example to "
+        "allow skipping of regions of Python code using "
+        "begin/end comments one could use: "
+        "--ignore-multiline-regex "
+        "'# codespell:ignore-begin *\\n.*# codespell:ignore-end *\\n'. "
+        "Defaults to empty/disabled.",
+    )
     parser.add_argument(
         "-I",
         "--ignore-words",
@@ -1115,6 +1151,20 @@ def main(*args: str) -> int:
     else:
         ignore_word_regex = None
 
+    if options.ignore_multiline_regex:
+        try:
+            ignore_multiline_regex = re.compile(
+                options.ignore_multiline_regex, re.DOTALL
+            )
+        except re.error as e:
+            return _usage_error(
+                parser,
+                f"ERROR: invalid --ignore-multiline-regex "
+                f'"{options.ignore_multiline_regex}" ({e})',
+            )
+    else:
+        ignore_multiline_regex = None
+
     ignore_words, ignore_words_cased = parse_ignore_words_option(
         options.ignore_words_list
     )
@@ -1203,7 +1253,11 @@ def main(*args: str) -> int:
         for exclude_file in exclude_files:
             build_exclude_hashes(exclude_file, exclude_lines)
 
-    file_opener = FileOpener(options.hard_encoding_detection, options.quiet_level)
+    file_opener = FileOpener(
+        options.hard_encoding_detection,
+        options.quiet_level,
+        ignore_multiline_regex,
+    )
 
     glob_match = GlobMatch(
         flatten_clean_comma_separated_arguments(options.skip) if options.skip else []
diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py
@@ -942,6 +942,43 @@ def test_ignore_regex_option(
     assert cs.main(fname, r"--ignore-regex=\bdonn\b") == 1
 
 
+def test_ignore_multiline_regex_option(
+    tmp_path: Path,
+    capsys: pytest.CaptureFixture[str],
+) -> None:
+    """Test ignore multiline regex option functionality."""
+
+    # Invalid regex.
+    result = cs.main("--ignore-multiline-regex=(", std=True)
+    assert isinstance(result, tuple)
+    code, stdout, _ = result
+    assert code == EX_USAGE
+    assert "usage:" in stdout
+
+    fname = tmp_path / "flag.txt"
+    fname.write_text(
+        """
+        Please see http://example.com/abandonned for info
+        # codespell:ignore-begin
+        '''
+        abandonned
+        abandonned
+        '''
+        # codespell:ignore-end
+        abandonned
+        """
+    )
+    assert cs.main(fname) == 4
+    assert (
+        cs.main(
+            fname,
+            "--ignore-multiline-regex",
+            "codespell:ignore-begin.*codespell:ignore-end",
+        )
+        == 2
+    )
+
+
 def test_uri_regex_option(
     tmp_path: Path,
     capsys: pytest.CaptureFixture[str],
diff --git a/pyproject.toml b/pyproject.toml
@@ -169,6 +169,6 @@ max-complexity = 45
 [tool.ruff.lint.pylint]
 allow-magic-value-types = ["bytes", "int", "str",]
 max-args = 13
-max-branches = 46
-max-returns = 11
+max-branches = 47
+max-returns = 12
 max-statements = 119