11use std:: fmt:: { Display , Formatter } ;
22use std:: str:: FromStr ;
33
4+ use ruff_diagnostics:: Applicability ;
45use ruff_macros:: { ViolationMetadata , derive_message_formats} ;
56use ruff_python_ast:: {
6- BytesLiteral , Expr , ExprBytesLiteral , ExprCall , ExprStringLiteral , StringLiteral ,
7+ BytesLiteral , Expr , ExprBytesLiteral , ExprCall , ExprStringLiteral , PythonVersion , StringLiteral ,
78} ;
89use ruff_python_semantic:: { Modules , SemanticModel } ;
9-
10- use ruff_text_size:: Ranged ;
10+ use ruff_text_size:: { Ranged , TextRange } ;
1111
1212use crate :: checkers:: ast:: Checker ;
1313use crate :: { Edit , Fix , FixAvailability , Violation } ;
@@ -24,6 +24,29 @@ use crate::{Edit, Fix, FixAvailability, Violation};
2424/// Regular expressions should be written
2525/// using raw strings to avoid double escaping.
2626///
27+ /// ## Fix safety
28+ /// The fix is unsafe if the string/bytes literal contains an escape sequence because the fix alters
29+ /// the runtime value of the literal while retaining the regex semantics.
30+ ///
31+ /// For example
32+ /// ```python
33+ /// # Literal is `1\n2`.
34+ /// re.compile("1\n2")
35+ ///
36+ /// # Literal is `1\\n2`, but the regex library will interpret `\\n` and will still match a newline
37+ /// # character as before.
38+ /// re.compile(r"1\n2")
39+ /// ```
40+ ///
41+ /// ## Fix availability
42+ /// A fix is not available if either
43+ /// * the argument is a string with a (no-op) `u` prefix (e.g., `u"foo"`) as the prefix is
44+ /// incompatible with the raw prefix `r`
45+ /// * the argument is a string or bytes literal with an escape sequence that has a different
46+ /// meaning in the context of a regular expression such as `\b`, which is word boundary or
47+ /// backspace in a regex, depending on the context, but always a backspace in string and bytes
48+ /// literals.
49+ ///
2750/// ## Example
2851///
2952/// ```python
@@ -163,20 +186,44 @@ fn check_string(checker: &Checker, literal: &StringLiteral, module: RegexModule,
163186 let range = literal. range ;
164187 let mut diagnostic = checker. report_diagnostic ( UnrawRePattern { module, func, kind } , range) ;
165188
166- if
167- // The (no-op) `u` prefix is a syntax error when combined with `r`
168- !literal. flags . prefix ( ) . is_unicode ( )
169- // We are looking for backslash characters
170- // in the raw source code here, because `\n`
171- // gets converted to a single character already
172- // at the lexing stage.
173- &&!checker. locator ( ) . slice ( literal. range ( ) ) . contains ( '\\' )
174- {
175- diagnostic. set_fix ( Fix :: safe_edit ( Edit :: insertion (
176- "r" . to_string ( ) ,
177- literal. range ( ) . start ( ) ,
178- ) ) ) ;
189+ let Some ( applicability) = raw_string_applicability ( checker, literal) else {
190+ return ;
191+ } ;
192+
193+ diagnostic. set_fix ( Fix :: applicable_edit (
194+ Edit :: insertion ( "r" . to_string ( ) , literal. range ( ) . start ( ) ) ,
195+ applicability,
196+ ) ) ;
197+ }
198+
199+ /// Check how safe it is to prepend the `r` prefix to the string.
200+ ///
201+ /// ## Returns
202+ /// * `None` if the prefix cannot be added,
203+ /// * `Some(a)` if it can be added with applicability `a`.
204+ fn raw_string_applicability ( checker : & Checker , literal : & StringLiteral ) -> Option < Applicability > {
205+ if literal. flags . prefix ( ) . is_unicode ( ) {
206+ // The (no-op) `u` prefix is a syntax error when combined with `r`
207+ return None ;
179208 }
209+
210+ if checker. target_version ( ) >= PythonVersion :: PY38 {
211+ raw_applicability ( checker, literal. range ( ) , |escaped| {
212+ matches ! (
213+ escaped,
214+ Some ( 'a' | 'f' | 'n' | 'r' | 't' | 'u' | 'U' | 'v' | 'x' | 'N' )
215+ )
216+ } )
217+ } else {
218+ raw_applicability ( checker, literal. range ( ) , |escaped| {
219+ matches ! (
220+ escaped,
221+ Some ( 'a' | 'f' | 'n' | 'r' | 't' | 'u' | 'U' | 'v' | 'x' )
222+ )
223+ } )
224+ }
225+
226+ // re.compile("\a\f\n\N{Partial Differential}\r\t\u27F2\U0001F0A1\v\x41") # with unsafe fix
180227}
181228
182229fn check_bytes ( checker : & Checker , literal : & BytesLiteral , module : RegexModule , func : & str ) {
@@ -187,5 +234,53 @@ fn check_bytes(checker: &Checker, literal: &BytesLiteral, module: RegexModule, f
187234 let kind = PatternKind :: Bytes ;
188235 let func = func. to_string ( ) ;
189236 let range = literal. range ;
190- checker. report_diagnostic ( UnrawRePattern { module, func, kind } , range) ;
237+ let mut diagnostic = checker. report_diagnostic ( UnrawRePattern { module, func, kind } , range) ;
238+
239+ let Some ( applicability) = raw_byte_applicability ( checker, literal) else {
240+ return ;
241+ } ;
242+
243+ diagnostic. set_fix ( Fix :: applicable_edit (
244+ Edit :: insertion ( "r" . to_string ( ) , literal. range ( ) . start ( ) ) ,
245+ applicability,
246+ ) ) ;
247+ }
248+
249+ /// Check how same it is to prepend the `r` prefix to the byte sting.
250+ ///
251+ /// ## Returns
252+ /// * `None` if the prefix cannot be added,
253+ /// * `Some(a)` if it can be added with applicability `a`.
254+ fn raw_byte_applicability ( checker : & Checker , literal : & BytesLiteral ) -> Option < Applicability > {
255+ raw_applicability ( checker, literal. range ( ) , |escaped| {
256+ matches ! ( escaped, Some ( 'a' | 'f' | 'n' | 'r' | 't' | 'v' | 'x' ) )
257+ } )
258+ }
259+
260+ fn raw_applicability (
261+ checker : & Checker ,
262+ literal_range : TextRange ,
263+ match_allowed_escape_sequence : impl Fn ( Option < char > ) -> bool ,
264+ ) -> Option < Applicability > {
265+ let mut found_slash = false ;
266+ let mut chars = checker. locator ( ) . slice ( literal_range) . chars ( ) . peekable ( ) ;
267+ while let Some ( char) = chars. next ( ) {
268+ if char == '\\' {
269+ found_slash = true ;
270+ // Turning `"\uXXXX"` into `r"\uXXXX"` is behaviorally equivalent when passed
271+ // to `re`, however, it's not exactly the same runtime value.
272+ // Similarly, for the other escape sequences.
273+ if !match_allowed_escape_sequence ( chars. peek ( ) . copied ( ) ) {
274+ // If the next character is not one of the whitelisted ones, we likely cannot safely turn
275+ // this into a raw string.
276+ return None ;
277+ }
278+ }
279+ }
280+
281+ Some ( if found_slash {
282+ Applicability :: Unsafe
283+ } else {
284+ Applicability :: Safe
285+ } )
191286}
0 commit comments