11use std:: fmt:: { Display , Formatter } ;
22use std:: str:: FromStr ;
33
4+ use ruff_diagnostics:: Applicability ;
45use ruff_macros:: { ViolationMetadata , derive_message_formats} ;
56use ruff_python_ast:: {
6- BytesLiteral , Expr , ExprBytesLiteral , ExprCall , ExprStringLiteral , StringLiteral ,
7+ BytesLiteral , Expr , ExprBytesLiteral , ExprCall , ExprStringLiteral , PythonVersion , StringLiteral ,
78} ;
89use ruff_python_semantic:: { Modules , SemanticModel } ;
9-
10- use ruff_text_size:: Ranged ;
10+ use ruff_text_size:: { Ranged , TextRange } ;
1111
1212use crate :: checkers:: ast:: Checker ;
1313use crate :: { Edit , Fix , FixAvailability , Violation } ;
@@ -24,6 +24,21 @@ use crate::{Edit, Fix, FixAvailability, Violation};
2424/// Regular expressions should be written
2525/// using raw strings to avoid double escaping.
2626///
27+ /// ## Fix safety
28+ /// If a fix is available and the string/bytes literal contains an escape sequence, the fix is
29+ /// marked unsafe because it alters the runtime value of the literal while retaining the regex
30+ /// semantics.
31+ ///
32+ /// For example
33+ /// ```python
34+ /// # Literal is `1\n2`.
35+ /// re.compile("1\n2")
36+ ///
37+ /// # Literal is `1\\n2`, but the regex library will interpret `\\n` and will still match a newline
38+ /// # character as before.
39+ /// re.compile(r"1\n2")
40+ /// ```
41+ ///
2742/// ## Example
2843///
2944/// ```python
@@ -163,20 +178,35 @@ fn check_string(checker: &Checker, literal: &StringLiteral, module: RegexModule,
163178 let range = literal. range ;
164179 let mut diagnostic = checker. report_diagnostic ( UnrawRePattern { module, func, kind } , range) ;
165180
166- if
167- // The (no-op) `u` prefix is a syntax error when combined with `r`
168- !literal. flags . prefix ( ) . is_unicode ( )
169- // We are looking for backslash characters
170- // in the raw source code here, because `\n`
171- // gets converted to a single character already
172- // at the lexing stage.
173- &&!checker. locator ( ) . slice ( literal. range ( ) ) . contains ( '\\' )
174- {
175- diagnostic. set_fix ( Fix :: safe_edit ( Edit :: insertion (
176- "r" . to_string ( ) ,
177- literal. range ( ) . start ( ) ,
178- ) ) ) ;
181+ let Some ( applicability) = raw_string_applicability ( checker, literal) else {
182+ return ;
183+ } ;
184+
185+ diagnostic. set_fix ( Fix :: applicable_edit (
186+ Edit :: insertion ( "r" . to_string ( ) , literal. range ( ) . start ( ) ) ,
187+ applicability,
188+ ) ) ;
189+ }
190+
191+ /// Check how same it is to prepend the `r` prefix to the sting.
192+ ///
193+ /// ## Returns
194+ /// * `None` if the prefix cannot be added,
195+ /// * `Some(a)` if it can be added with applicability `a`.
196+ fn raw_string_applicability ( checker : & Checker , literal : & StringLiteral ) -> Option < Applicability > {
197+ if literal. flags . prefix ( ) . is_unicode ( ) {
198+ // The (no-op) `u` prefix is a syntax error when combined with `r`
199+ return None ;
179200 }
201+
202+ raw_applicability ( checker, literal. range ( ) , |escaped| {
203+ matches ! (
204+ escaped,
205+ Some ( 'a' | 'f' | 'n' | 'r' | 't' | 'u' | 'U' | 'v' | 'x' )
206+ ) || checker. target_version ( ) >= PythonVersion :: PY38 && escaped. is_some_and ( |c| c == 'N' )
207+ } )
208+
209+ // re.compile("\a\f\n\N{Partial Differential}\r\t\u27F2\U0001F0A1\v\x41") # with unsafe fix
180210}
181211
182212fn check_bytes ( checker : & Checker , literal : & BytesLiteral , module : RegexModule , func : & str ) {
@@ -187,5 +217,53 @@ fn check_bytes(checker: &Checker, literal: &BytesLiteral, module: RegexModule, f
187217 let kind = PatternKind :: Bytes ;
188218 let func = func. to_string ( ) ;
189219 let range = literal. range ;
190- checker. report_diagnostic ( UnrawRePattern { module, func, kind } , range) ;
220+ let mut diagnostic = checker. report_diagnostic ( UnrawRePattern { module, func, kind } , range) ;
221+
222+ let Some ( applicability) = raw_byte_applicability ( checker, literal) else {
223+ return ;
224+ } ;
225+
226+ diagnostic. set_fix ( Fix :: applicable_edit (
227+ Edit :: insertion ( "r" . to_string ( ) , literal. range ( ) . start ( ) ) ,
228+ applicability,
229+ ) ) ;
230+ }
231+
232+ /// Check how same it is to prepend the `r` prefix to the byte sting.
233+ ///
234+ /// ## Returns
235+ /// * `None` if the prefix cannot be added,
236+ /// * `Some(a)` if it can be added with applicability `a`.
237+ fn raw_byte_applicability ( checker : & Checker , literal : & BytesLiteral ) -> Option < Applicability > {
238+ raw_applicability ( checker, literal. range ( ) , |escaped| {
239+ matches ! ( escaped, Some ( 'a' | 'f' | 'n' | 'r' | 't' | 'v' | 'x' ) )
240+ } )
241+ }
242+
243+ fn raw_applicability (
244+ checker : & Checker ,
245+ literal_range : TextRange ,
246+ match_allowed_escape_sequence : impl Fn ( Option < char > ) -> bool ,
247+ ) -> Option < Applicability > {
248+ let mut found_slash = false ;
249+ let mut chars = checker. locator ( ) . slice ( literal_range) . chars ( ) . peekable ( ) ;
250+ while let Some ( char) = chars. next ( ) {
251+ if char == '\\' {
252+ found_slash = true ;
253+ // Turning `"\uXXXX"` into `r"\uXXXX"` is behaviorally equivalent when passed
254+ // to `re`, however, it's not exactly the same runtime value.
255+ // Similarly, for the other escape sequences.
256+ if !match_allowed_escape_sequence ( chars. peek ( ) . copied ( ) ) {
257+ // If the next character is not one of whitelisted one, we likely cannot safely turn
258+ // this into a raw string.
259+ return None ;
260+ }
261+ }
262+ }
263+
264+ Some ( if found_slash {
265+ Applicability :: Unsafe
266+ } else {
267+ Applicability :: Safe
268+ } )
191269}
0 commit comments