Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions crates/ruff_linter/resources/test/fixtures/ruff/RUF039.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,16 @@
regex.splititer(both, non_literal)
regex.subf(f, lambda _: r'means', '"format"')
regex.subfn(fn, f'''a$1n't''', lambda: "'function'")


# https://github.com/astral-sh/ruff/issues/16713
re.compile("\a\f\n\r\t\u27F2\U0001F0A1\v\x41") # with unsafe fix
re.compile("\b") # without fix
re.compile("\"") # without fix
re.compile("\'") # without fix
re.compile('\"') # without fix
re.compile('\'') # without fix
re.compile("\\") # without fix
re.compile("\101") # without fix
re.compile("a\
b") # without fix
17 changes: 17 additions & 0 deletions crates/ruff_linter/resources/test/fixtures/ruff/RUF039_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,20 @@
br''br""br''
)
regex.subfn(br'I\s\nee*d\s[O0o]me\x20\Qoffe\E, ' br'b')


# https://github.com/astral-sh/ruff/issues/16713
re.compile(
"["
"\U0001F600-\U0001F64F" # emoticons
"\U0001F300-\U0001F5FF" # symbols & pictographs
"\U0001F680-\U0001F6FF" # transport & map symbols
"\U0001F1E0-\U0001F1FF" # flags (iOS)
"\U00002702-\U000027B0"
"\U000024C2-\U0001F251"
"\u200d" # zero width joiner
"\u200c" # zero width non-joiner
"\\u200c" # must not be escaped in a raw string
"]+",
flags=re.UNICODE,
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import re

re.compile("\N{Partial Differential}") # with unsafe fix if python target is 3.8 or higher, else without fix
38 changes: 38 additions & 0 deletions crates/ruff_linter/src/rules/ruff/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -554,6 +554,44 @@ mod tests {
Ok(())
}

#[test_case(Rule::UnrawRePattern, Path::new("RUF039_py_version_sensitive.py"))]
fn preview_rules_py37(rule_code: Rule, path: &Path) -> Result<()> {
let snapshot = format!(
"preview__py37__{}_{}",
rule_code.noqa_code(),
path.to_string_lossy()
);
let diagnostics = test_path(
Path::new("ruff").join(path).as_path(),
&settings::LinterSettings {
preview: PreviewMode::Enabled,
unresolved_target_version: PythonVersion::PY37.into(),
..settings::LinterSettings::for_rule(rule_code)
},
)?;
assert_diagnostics!(snapshot, diagnostics);
Ok(())
}

#[test_case(Rule::UnrawRePattern, Path::new("RUF039_py_version_sensitive.py"))]
fn preview_rules_py38(rule_code: Rule, path: &Path) -> Result<()> {
let snapshot = format!(
"preview__py38__{}_{}",
rule_code.noqa_code(),
path.to_string_lossy()
);
let diagnostics = test_path(
Path::new("ruff").join(path).as_path(),
&settings::LinterSettings {
preview: PreviewMode::Enabled,
unresolved_target_version: PythonVersion::PY38.into(),
..settings::LinterSettings::for_rule(rule_code)
},
)?;
assert_diagnostics!(snapshot, diagnostics);
Ok(())
}

#[test_case(Rule::UsedDummyVariable, Path::new("RUF052.py"), r"^_+", 1)]
#[test_case(Rule::UsedDummyVariable, Path::new("RUF052.py"), r"", 2)]
fn custom_regexp_preset(
Expand Down
129 changes: 112 additions & 17 deletions crates/ruff_linter/src/rules/ruff/rules/unraw_re_pattern.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
use std::fmt::{Display, Formatter};
use std::str::FromStr;

use ruff_diagnostics::Applicability;
use ruff_macros::{ViolationMetadata, derive_message_formats};
use ruff_python_ast::{
BytesLiteral, Expr, ExprBytesLiteral, ExprCall, ExprStringLiteral, StringLiteral,
BytesLiteral, Expr, ExprBytesLiteral, ExprCall, ExprStringLiteral, PythonVersion, StringLiteral,
};
use ruff_python_semantic::{Modules, SemanticModel};

use ruff_text_size::Ranged;
use ruff_text_size::{Ranged, TextRange};

use crate::checkers::ast::Checker;
use crate::{Edit, Fix, FixAvailability, Violation};
Expand All @@ -24,6 +24,29 @@ use crate::{Edit, Fix, FixAvailability, Violation};
/// Regular expressions should be written
/// using raw strings to avoid double escaping.
///
/// ## Fix safety
/// The fix is unsafe if the string/bytes literal contains an escape sequence because the fix alters
/// the runtime value of the literal while retaining the regex semantics.
///
/// For example
/// ```python
/// # Literal is `1\n2`.
/// re.compile("1\n2")
///
/// # Literal is `1\\n2`, but the regex library will interpret `\\n` and will still match a newline
/// # character as before.
/// re.compile(r"1\n2")
/// ```
///
/// ## Fix availability
/// A fix is not available if either
/// * the argument is a string with a (no-op) `u` prefix (e.g., `u"foo"`) as the prefix is
/// incompatible with the raw prefix `r`
/// * the argument is a string or bytes literal with an escape sequence that has a different
/// meaning in the context of a regular expression such as `\b`, which is word boundary or
/// backspace in a regex, depending on the context, but always a backspace in string and bytes
/// literals.
///
/// ## Example
///
/// ```python
Expand Down Expand Up @@ -163,20 +186,44 @@ fn check_string(checker: &Checker, literal: &StringLiteral, module: RegexModule,
let range = literal.range;
let mut diagnostic = checker.report_diagnostic(UnrawRePattern { module, func, kind }, range);

if
// The (no-op) `u` prefix is a syntax error when combined with `r`
!literal.flags.prefix().is_unicode()
// We are looking for backslash characters
// in the raw source code here, because `\n`
// gets converted to a single character already
// at the lexing stage.
&&!checker.locator().slice(literal.range()).contains('\\')
{
diagnostic.set_fix(Fix::safe_edit(Edit::insertion(
"r".to_string(),
literal.range().start(),
)));
let Some(applicability) = raw_string_applicability(checker, literal) else {
return;
};

diagnostic.set_fix(Fix::applicable_edit(
Edit::insertion("r".to_string(), literal.range().start()),
applicability,
));
}

/// Check how safe it is to prepend the `r` prefix to the string.
///
/// ## Returns
/// * `None` if the prefix cannot be added,
/// * `Some(a)` if it can be added with applicability `a`.
fn raw_string_applicability(checker: &Checker, literal: &StringLiteral) -> Option<Applicability> {
if literal.flags.prefix().is_unicode() {
// The (no-op) `u` prefix is a syntax error when combined with `r`
return None;
}

if checker.target_version() >= PythonVersion::PY38 {
raw_applicability(checker, literal.range(), |escaped| {
matches!(
escaped,
Some('a' | 'f' | 'n' | 'r' | 't' | 'u' | 'U' | 'v' | 'x' | 'N')
)
})
} else {
raw_applicability(checker, literal.range(), |escaped| {
matches!(
escaped,
Some('a' | 'f' | 'n' | 'r' | 't' | 'u' | 'U' | 'v' | 'x')
)
})
}

// re.compile("\a\f\n\N{Partial Differential}\r\t\u27F2\U0001F0A1\v\x41") # with unsafe fix
}

fn check_bytes(checker: &Checker, literal: &BytesLiteral, module: RegexModule, func: &str) {
Expand All @@ -187,5 +234,53 @@ fn check_bytes(checker: &Checker, literal: &BytesLiteral, module: RegexModule, f
let kind = PatternKind::Bytes;
let func = func.to_string();
let range = literal.range;
checker.report_diagnostic(UnrawRePattern { module, func, kind }, range);
let mut diagnostic = checker.report_diagnostic(UnrawRePattern { module, func, kind }, range);

let Some(applicability) = raw_byte_applicability(checker, literal) else {
return;
};

diagnostic.set_fix(Fix::applicable_edit(
Edit::insertion("r".to_string(), literal.range().start()),
applicability,
));
}

/// Check how same it is to prepend the `r` prefix to the byte sting.
///
/// ## Returns
/// * `None` if the prefix cannot be added,
/// * `Some(a)` if it can be added with applicability `a`.
fn raw_byte_applicability(checker: &Checker, literal: &BytesLiteral) -> Option<Applicability> {
raw_applicability(checker, literal.range(), |escaped| {
matches!(escaped, Some('a' | 'f' | 'n' | 'r' | 't' | 'v' | 'x'))
})
}

fn raw_applicability(
checker: &Checker,
literal_range: TextRange,
match_allowed_escape_sequence: impl Fn(Option<char>) -> bool,
) -> Option<Applicability> {
let mut found_slash = false;
let mut chars = checker.locator().slice(literal_range).chars().peekable();
while let Some(char) = chars.next() {
if char == '\\' {
found_slash = true;
// Turning `"\uXXXX"` into `r"\uXXXX"` is behaviorally equivalent when passed
// to `re`, however, it's not exactly the same runtime value.
// Similarly, for the other escape sequences.
if !match_allowed_escape_sequence(chars.peek().copied()) {
// If the next character is not one of the whitelisted ones, we likely cannot safely turn
// this into a raw string.
return None;
}
}
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is some related code that I wrote for another rule:

for (c, next) in lit.value.chars().tuple_windows() {
// `\0` (or any other ASCII digit) and `\g` have special meaning in `repl` strings.
// Meanwhile, nearly all other escapes of ASCII letters in a `repl` string causes
// `re.PatternError` to be raised at runtime.
//
// If we see that the escaped character is an alphanumeric ASCII character,
// we should only emit a diagnostic suggesting to replace the `re.sub()` call with
// `str.replace`if we can detect that the escaped character is one that is both
// valid in a `repl` string *and* does not have any special meaning in a REPL string.
//
// It's out of scope for this rule to change invalid `re.sub()` calls into something
// that would not raise an exception at runtime. They should be left as-is.
if c == '\\' && next.is_ascii_alphanumeric() {
if "abfnrtv".contains(next) {
fixable = false;
} else {
return None;
}
}
}

It looks like we're handling a slightly different set of characters there: abfnrtv. b is present there but not here and x is here but not there, for example. Is that expected, or should the two be the same?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I kind of like this str::contains check more than having to pass a closure. Then we could avoid calling the checker.target_version() repeatedly too, not that it's too expensive. For example:

let allowed = if checker.target_version() >= PythonVersion::PY38 {
    "afnrtuUvxN"
} else {
    "afnrtuUvx"
};

and for bytes it would just be "afnrtvx".

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's also this section in the docs (just above the linked node):

Octal escapes are included in a limited form. If the first digit is a 0, or if there are three octal digits, it is considered an octal escape. Otherwise, it is a group reference. As for string literals, octal escapes are always at most three digits in length.

This might be too niche to worry about, if it's relevant at all. I see there's a test case with an octal escape, so this seems intentional.

Copy link
Contributor Author

@robsdedude robsdedude Jul 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I kind of like this str::contains check more

If you prefer the str::contains approach for readability, I'll follow whatever you prefer as this is your code base to maintain. If you worry about performance: match is often faster. Pulling the checker.target_version() out of the closure is still possible and since Rust will monomorphize raw_applicability with the closure/anonymous function the compiler can optimize things to its heart's content (as opposed to receiving an opaque str and calling contains on it (see godbolt example).

Copy link
Contributor Author

@robsdedude robsdedude Jul 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is that expected, or should the two be the same?

I'm unsure. The linked code sure is different. If I'm interpreting it right, it's trying to go from re.sub to str.replace. While in this PR we're trying to go from re.some_func("some str") to re.some_func(r"some str"). So here we're always staying in the regex world. It's just a question of how many \es the string will end up having. My gut feeling is that those cases are not the same. But as said, unsure.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This might be too niche to worry about

Yeah... I didn't want to go there 😅 I mean it wouldn't be hard to implement, but probably slightly less efficient because the search window would grow from 2 to 4 characters.

I'll leave it up to you to decide if you want me to go for it or not.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You've convinced me :) let's stick with this version.


Some(if found_slash {
Applicability::Unsafe
} else {
Applicability::Safe
})
}
Loading
Loading