Skip to content

Commit b6f4d1f

Browse files
delino[bot]github-actions[bot]claudekdy1
authored
fix(es/codegen): Encode non-ASCII chars in regex with ascii_only option (#11155)
## Summary

Fixes #11146

This PR fixes a bug where the SWC minifier does not encode non-ASCII characters in regular expressions when the `ascii_only` option is enabled. Previously, when `ascii_only: true` was set, non-ASCII characters in strings were correctly encoded to Unicode escape sequences, but regex patterns were left unchanged. This PR ensures regex patterns receive the same treatment.

## Changes

### 1. Added `encode_regex_for_ascii` function

A new helper function in `crates/swc_ecma_codegen/src/lit.rs` that:

- Encodes non-ASCII characters in regex patterns to Unicode escape sequences
- Uses `\xHH` format for characters in range `\x7f` to `\xff`
- Uses `\uHHHH` format for characters above `\xff`
- Encodes characters beyond BMP (U+FFFF) as surrogate pairs for compatibility
- Preserves ASCII characters as-is for optimal performance
- Returns borrowed string when `ascii_only: false` or pattern is pure ASCII

### 2. Updated `Lit::Regex` emission logic

Modified the regex literal emission in `lit.rs:32-39` to:

- Check if `ascii_only` is enabled via `emitter.cfg.ascii_only`
- Apply the encoding function to the regex expression before writing
- Maintain the same behavior as string literal encoding

### 3. Added comprehensive unit tests

Five new tests in `crates/swc_ecma_codegen/src/tests.rs`:

- `ascii_only_regex_1`: Verifies non-ASCII chars preserved when `ascii_only: false`
- `ascii_only_regex_2`: Verifies encoding with specific example from issue #11146
- `ascii_only_regex_3`: Tests emoji preservation when `ascii_only: false`
- `ascii_only_regex_4`: Tests emoji encoding when `ascii_only: true`
- `ascii_only_regex_5`: Ensures pure ASCII regex unchanged with `ascii_only: true`

## Example

**Input:**

```javascript
/[\w@Ø-ÞÀ-Öß-öø-ÿ]/
```

**Output with `ascii_only: false`:**

```javascript
/[\w@Ø-ÞÀ-Öß-öø-ÿ]/
```

**Output with `ascii_only: true`:**

```javascript
/[\w@\xd8-\xde\xc0-\xd6\xdf-\xf6\xf8-\xff]/
```

## Test plan

- [x] All new unit tests pass (`cargo test -p swc_ecma_codegen --lib -- ascii_only_regex`)
- [x] Code formatted with `cargo fmt --all`
- [x] Implementation follows existing patterns from `get_quoted_utf16` function
- [x] Adheres to CLAUDE.md requirements (performance-focused, documented, English comments)

## Related

- Issue: #11146
- Similar behavior to how Terser handles `ascii_only` option

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: claude[bot] <41898282+claude[bot]@users.noreply.github.com>
Co-authored-by: Claude <noreply@anthropic.com>
Co-authored-by: Donny/강동윤 <kdy.1997.dev@gmail.com>
1 parent 630484f commit b6f4d1f

File tree

3 files changed

+122
-1
lines changed

3 files changed

+122
-1
lines changed

.changeset/fast-pens-admire.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
---
2+
swc_ecma_codegen: patch
3+
swc_core: patch
4+
---
5+
6+
fix(codegen): Encode non-ASCII chars in regex with ascii_only option

crates/swc_ecma_codegen/src/lit.rs

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,9 @@ impl MacroNode for Lit {
3131
Lit::Num(ref n) => emit!(n),
3232
Lit::Regex(ref n) => {
3333
punct!(emitter, "/");
34-
emitter.wr.write_str(&n.exp)?;
34+
// Encode non-ASCII characters in regex pattern when ascii_only is enabled
35+
let encoded_exp = encode_regex_for_ascii(&n.exp, emitter.cfg.ascii_only);
36+
emitter.wr.write_str(&encoded_exp)?;
3537
punct!(emitter, "/");
3638
emitter.wr.write_str(&n.flags)?;
3739
}
@@ -315,6 +317,54 @@ where
315317
}
316318
}
317319

320+
/// Encodes non-ASCII characters in regex patterns when ascii_only is enabled.
321+
///
322+
/// This function converts non-ASCII characters to their Unicode escape
323+
/// sequences to ensure the regex can be safely used in ASCII-only contexts.
324+
///
325+
/// # Arguments
326+
/// * `pattern` - The regex pattern string to encode
327+
/// * `ascii_only` - Whether to encode non-ASCII characters
328+
///
329+
/// # Returns
330+
/// A string with non-ASCII characters encoded as Unicode escapes
331+
pub fn encode_regex_for_ascii(pattern: &str, ascii_only: bool) -> CowStr {
332+
if !ascii_only || pattern.is_ascii() {
333+
return CowStr::Borrowed(pattern);
334+
}
335+
336+
let mut buf = CompactString::with_capacity(pattern.len());
337+
338+
for c in pattern.chars() {
339+
match c {
340+
// ASCII characters are preserved as-is
341+
'\x00'..='\x7e' => buf.push(c),
342+
// Characters in the \x7f to \xff range use \xHH format
343+
'\u{7f}'..='\u{ff}' => {
344+
buf.push_str("\\x");
345+
write!(&mut buf, "{:02x}", c as u8).unwrap();
346+
}
347+
// Line/paragraph separators need escaping in all contexts
348+
'\u{2028}' => buf.push_str("\\u2028"),
349+
'\u{2029}' => buf.push_str("\\u2029"),
350+
// Characters above \xff use \uHHHH format
351+
_ => {
352+
if c > '\u{FFFF}' {
353+
// Characters beyond BMP are encoded as surrogate pairs for compatibility
354+
let code_point = c as u32;
355+
let h = ((code_point - 0x10000) / 0x400) + 0xd800;
356+
let l = ((code_point - 0x10000) % 0x400) + 0xdc00;
357+
write!(&mut buf, "\\u{h:04x}\\u{l:04x}").unwrap();
358+
} else {
359+
write!(&mut buf, "\\u{:04x}", c as u16).unwrap();
360+
}
361+
}
362+
}
363+
}
364+
365+
CowStr::Owned(buf)
366+
}
367+
318368
/// Returns `(quote_char, value)`
319369
pub fn get_quoted_utf16(v: &str, ascii_only: bool, target: EsVersion) -> (AsciiChar, CowStr) {
320370
// Fast path: If the string is ASCII and doesn't need escaping, we can avoid

crates/swc_ecma_codegen/src/tests.rs

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -892,6 +892,71 @@ export default {
892892
);
893893
}
894894

895+
/// With `ascii_only` disabled, a regex character class containing Latin-1
/// characters is expected to be emitted without any escaping.
#[test]
fn ascii_only_regex_1() {
    let config = Config {
        ascii_only: false,
        ..Default::default()
    };

    test_all(
        r"/[\w@Ø-ÞÀ-Öß-öø-ÿ]/",
        r"/[\w@Ø-ÞÀ-Öß-öø-ÿ]/;",
        r"/[\w@Ø-ÞÀ-Öß-öø-ÿ]/",
        config,
    );
}
907+
908+
/// With `ascii_only` enabled, Latin-1 characters inside a regex character
/// class are expected to be emitted as `\xHH` escapes.
#[test]
fn ascii_only_regex_2() {
    let config = Config {
        ascii_only: true,
        ..Default::default()
    };

    test_all(
        r"/[\w@Ø-ÞÀ-Öß-öø-ÿ]/",
        r"/[\w@\xd8-\xde\xc0-\xd6\xdf-\xf6\xf8-\xff]/;",
        r"/[\w@\xd8-\xde\xc0-\xd6\xdf-\xf6\xf8-\xff]/",
        config,
    );
}
920+
921+
/// With `ascii_only` disabled, emoji inside a regex are expected to be
/// emitted verbatim.
#[test]
fn ascii_only_regex_3() {
    let config = Config {
        ascii_only: false,
        ..Default::default()
    };

    test_all(
        r"/[😊❤️]/g",
        r"/[😊❤️]/g;",
        r"/[😊❤️]/g",
        config,
    );
}
933+
934+
/// With `ascii_only` enabled, emoji inside a regex are expected to be
/// emitted as `\uHHHH` escapes, using a surrogate pair for the character
/// beyond the BMP.
#[test]
fn ascii_only_regex_4() {
    let config = Config {
        ascii_only: true,
        ..Default::default()
    };

    test_all(
        r"/[😊❤️]/g",
        r"/[\ud83d\ude0a\u2764\ufe0f]/g;",
        r"/[\ud83d\ude0a\u2764\ufe0f]/g",
        config,
    );
}
946+
947+
/// With `ascii_only` enabled, a regex that is already pure ASCII is expected
/// to be emitted unchanged.
#[test]
fn ascii_only_regex_5() {
    let config = Config {
        ascii_only: true,
        ..Default::default()
    };

    test_all(r"/test/", r"/test/;", r"/test/", config);
}
959+
895960
#[test]
896961
fn emit_type_import_statement_named() {
897962
let from = r#"

0 commit comments

Comments
 (0)