Skip to content

Commit a5b53fb

Browse files
robsdedudentBre
authored andcommitted
[ruff] Offer fixes for RUF039 in more cases (#19065)
## Summary Expand cases in which ruff can offer a fix for `RUF039` (some of which are unsafe). While turning `"\n"` (== `\n`) into `r"\n"` (== `\\n`) is not equivalent at run-time, it's still functionally equivalent to do so in the context of [regex patterns](https://docs.python.org/3/library/re.html#regular-expression-syntax) as they themselves interpret the escape sequence. Therefore, an unsafe fix can be offered. Further, this PR also makes ruff offer fixes for byte string literals, not only strings literals as before. ## Test Plan Tests for all escape sequences have been added. ## Related Closes: #16713 --------- Co-authored-by: Brent Westbrook <36778786+ntBre@users.noreply.github.com>
1 parent 4fa6cc6 commit a5b53fb

9 files changed

+649
-27
lines changed

crates/ruff_linter/resources/test/fixtures/ruff/RUF039.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,3 +53,16 @@
5353
regex.splititer(both, non_literal)
5454
regex.subf(f, lambda _: r'means', '"format"')
5555
regex.subfn(fn, f'''a$1n't''', lambda: "'function'")
56+
57+
58+
# https://github.com/astral-sh/ruff/issues/16713
59+
re.compile("\a\f\n\r\t\u27F2\U0001F0A1\v\x41") # with unsafe fix
60+
re.compile("\b") # without fix
61+
re.compile("\"") # without fix
62+
re.compile("\'") # without fix
63+
re.compile('\"') # without fix
64+
re.compile('\'') # without fix
65+
re.compile("\\") # without fix
66+
re.compile("\101") # without fix
67+
re.compile("a\
68+
b") # without fix

crates/ruff_linter/resources/test/fixtures/ruff/RUF039_concat.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,3 +91,20 @@
9191
br''br""br''
9292
)
9393
regex.subfn(br'I\s\nee*d\s[O0o]me\x20\Qoffe\E, ' br'b')
94+
95+
96+
# https://github.com/astral-sh/ruff/issues/16713
97+
re.compile(
98+
"["
99+
"\U0001F600-\U0001F64F" # emoticons
100+
"\U0001F300-\U0001F5FF" # symbols & pictographs
101+
"\U0001F680-\U0001F6FF" # transport & map symbols
102+
"\U0001F1E0-\U0001F1FF" # flags (iOS)
103+
"\U00002702-\U000027B0"
104+
"\U000024C2-\U0001F251"
105+
"\u200d" # zero width joiner
106+
"\u200c" # zero width non-joiner
107+
"\\u200c" # must not be escaped in a raw string
108+
"]+",
109+
flags=re.UNICODE,
110+
)
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
import re
2+
3+
re.compile("\N{Partial Differential}") # with unsafe fix if python target is 3.8 or higher, else without fix

crates/ruff_linter/src/rules/ruff/mod.rs

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -555,6 +555,44 @@ mod tests {
555555
Ok(())
556556
}
557557

558+
#[test_case(Rule::UnrawRePattern, Path::new("RUF039_py_version_sensitive.py"))]
559+
fn preview_rules_py37(rule_code: Rule, path: &Path) -> Result<()> {
560+
let snapshot = format!(
561+
"preview__py37__{}_{}",
562+
rule_code.noqa_code(),
563+
path.to_string_lossy()
564+
);
565+
let diagnostics = test_path(
566+
Path::new("ruff").join(path).as_path(),
567+
&settings::LinterSettings {
568+
preview: PreviewMode::Enabled,
569+
unresolved_target_version: PythonVersion::PY37.into(),
570+
..settings::LinterSettings::for_rule(rule_code)
571+
},
572+
)?;
573+
assert_diagnostics!(snapshot, diagnostics);
574+
Ok(())
575+
}
576+
577+
#[test_case(Rule::UnrawRePattern, Path::new("RUF039_py_version_sensitive.py"))]
578+
fn preview_rules_py38(rule_code: Rule, path: &Path) -> Result<()> {
579+
let snapshot = format!(
580+
"preview__py38__{}_{}",
581+
rule_code.noqa_code(),
582+
path.to_string_lossy()
583+
);
584+
let diagnostics = test_path(
585+
Path::new("ruff").join(path).as_path(),
586+
&settings::LinterSettings {
587+
preview: PreviewMode::Enabled,
588+
unresolved_target_version: PythonVersion::PY38.into(),
589+
..settings::LinterSettings::for_rule(rule_code)
590+
},
591+
)?;
592+
assert_diagnostics!(snapshot, diagnostics);
593+
Ok(())
594+
}
595+
558596
#[test_case(Rule::UsedDummyVariable, Path::new("RUF052.py"), r"^_+", 1)]
559597
#[test_case(Rule::UsedDummyVariable, Path::new("RUF052.py"), r"", 2)]
560598
fn custom_regexp_preset(

crates/ruff_linter/src/rules/ruff/rules/unraw_re_pattern.rs

Lines changed: 112 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
use std::fmt::{Display, Formatter};
22
use std::str::FromStr;
33

4+
use ruff_diagnostics::Applicability;
45
use ruff_macros::{ViolationMetadata, derive_message_formats};
56
use ruff_python_ast::{
6-
BytesLiteral, Expr, ExprBytesLiteral, ExprCall, ExprStringLiteral, StringLiteral,
7+
BytesLiteral, Expr, ExprBytesLiteral, ExprCall, ExprStringLiteral, PythonVersion, StringLiteral,
78
};
89
use ruff_python_semantic::{Modules, SemanticModel};
9-
10-
use ruff_text_size::Ranged;
10+
use ruff_text_size::{Ranged, TextRange};
1111

1212
use crate::checkers::ast::Checker;
1313
use crate::{Edit, Fix, FixAvailability, Violation};
@@ -24,6 +24,29 @@ use crate::{Edit, Fix, FixAvailability, Violation};
2424
/// Regular expressions should be written
2525
/// using raw strings to avoid double escaping.
2626
///
27+
/// ## Fix safety
28+
/// The fix is unsafe if the string/bytes literal contains an escape sequence because the fix alters
29+
/// the runtime value of the literal while retaining the regex semantics.
30+
///
31+
/// For example
32+
/// ```python
33+
/// # Literal is `1\n2`.
34+
/// re.compile("1\n2")
35+
///
36+
/// # Literal is `1\\n2`, but the regex library will interpret `\\n` and will still match a newline
37+
/// # character as before.
38+
/// re.compile(r"1\n2")
39+
/// ```
40+
///
41+
/// ## Fix availability
42+
/// A fix is not available if either
43+
/// * the argument is a string with a (no-op) `u` prefix (e.g., `u"foo"`) as the prefix is
44+
/// incompatible with the raw prefix `r`
45+
/// * the argument is a string or bytes literal with an escape sequence that has a different
46+
/// meaning in the context of a regular expression such as `\b`, which is word boundary or
47+
/// backspace in a regex, depending on the context, but always a backspace in string and bytes
48+
/// literals.
49+
///
2750
/// ## Example
2851
///
2952
/// ```python
@@ -163,20 +186,44 @@ fn check_string(checker: &Checker, literal: &StringLiteral, module: RegexModule,
163186
let range = literal.range;
164187
let mut diagnostic = checker.report_diagnostic(UnrawRePattern { module, func, kind }, range);
165188

166-
if
167-
// The (no-op) `u` prefix is a syntax error when combined with `r`
168-
!literal.flags.prefix().is_unicode()
169-
// We are looking for backslash characters
170-
// in the raw source code here, because `\n`
171-
// gets converted to a single character already
172-
// at the lexing stage.
173-
&&!checker.locator().slice(literal.range()).contains('\\')
174-
{
175-
diagnostic.set_fix(Fix::safe_edit(Edit::insertion(
176-
"r".to_string(),
177-
literal.range().start(),
178-
)));
189+
let Some(applicability) = raw_string_applicability(checker, literal) else {
190+
return;
191+
};
192+
193+
diagnostic.set_fix(Fix::applicable_edit(
194+
Edit::insertion("r".to_string(), literal.range().start()),
195+
applicability,
196+
));
197+
}
198+
199+
/// Check how safe it is to prepend the `r` prefix to the string.
200+
///
201+
/// ## Returns
202+
/// * `None` if the prefix cannot be added,
203+
/// * `Some(a)` if it can be added with applicability `a`.
204+
fn raw_string_applicability(checker: &Checker, literal: &StringLiteral) -> Option<Applicability> {
205+
if literal.flags.prefix().is_unicode() {
206+
// The (no-op) `u` prefix is a syntax error when combined with `r`
207+
return None;
179208
}
209+
210+
if checker.target_version() >= PythonVersion::PY38 {
211+
raw_applicability(checker, literal.range(), |escaped| {
212+
matches!(
213+
escaped,
214+
Some('a' | 'f' | 'n' | 'r' | 't' | 'u' | 'U' | 'v' | 'x' | 'N')
215+
)
216+
})
217+
} else {
218+
raw_applicability(checker, literal.range(), |escaped| {
219+
matches!(
220+
escaped,
221+
Some('a' | 'f' | 'n' | 'r' | 't' | 'u' | 'U' | 'v' | 'x')
222+
)
223+
})
224+
}
225+
226+
// re.compile("\a\f\n\N{Partial Differential}\r\t\u27F2\U0001F0A1\v\x41") # with unsafe fix
180227
}
181228

182229
fn check_bytes(checker: &Checker, literal: &BytesLiteral, module: RegexModule, func: &str) {
@@ -187,5 +234,53 @@ fn check_bytes(checker: &Checker, literal: &BytesLiteral, module: RegexModule, f
187234
let kind = PatternKind::Bytes;
188235
let func = func.to_string();
189236
let range = literal.range;
190-
checker.report_diagnostic(UnrawRePattern { module, func, kind }, range);
237+
let mut diagnostic = checker.report_diagnostic(UnrawRePattern { module, func, kind }, range);
238+
239+
let Some(applicability) = raw_byte_applicability(checker, literal) else {
240+
return;
241+
};
242+
243+
diagnostic.set_fix(Fix::applicable_edit(
244+
Edit::insertion("r".to_string(), literal.range().start()),
245+
applicability,
246+
));
247+
}
248+
249+
/// Check how same it is to prepend the `r` prefix to the byte sting.
250+
///
251+
/// ## Returns
252+
/// * `None` if the prefix cannot be added,
253+
/// * `Some(a)` if it can be added with applicability `a`.
254+
fn raw_byte_applicability(checker: &Checker, literal: &BytesLiteral) -> Option<Applicability> {
255+
raw_applicability(checker, literal.range(), |escaped| {
256+
matches!(escaped, Some('a' | 'f' | 'n' | 'r' | 't' | 'v' | 'x'))
257+
})
258+
}
259+
260+
fn raw_applicability(
261+
checker: &Checker,
262+
literal_range: TextRange,
263+
match_allowed_escape_sequence: impl Fn(Option<char>) -> bool,
264+
) -> Option<Applicability> {
265+
let mut found_slash = false;
266+
let mut chars = checker.locator().slice(literal_range).chars().peekable();
267+
while let Some(char) = chars.next() {
268+
if char == '\\' {
269+
found_slash = true;
270+
// Turning `"\uXXXX"` into `r"\uXXXX"` is behaviorally equivalent when passed
271+
// to `re`, however, it's not exactly the same runtime value.
272+
// Similarly, for the other escape sequences.
273+
if !match_allowed_escape_sequence(chars.peek().copied()) {
274+
// If the next character is not one of the whitelisted ones, we likely cannot safely turn
275+
// this into a raw string.
276+
return None;
277+
}
278+
}
279+
}
280+
281+
Some(if found_slash {
282+
Applicability::Unsafe
283+
} else {
284+
Applicability::Safe
285+
})
191286
}

0 commit comments

Comments
 (0)