fix(es/codegen): Encode non-ASCII chars in regex with ascii_only option (#11155)

delino[bot] · github-actions[bot] · claude · web-flow · commit b6f4d1f8b76a · 2025-10-19T14:23:27.000Z
## Summary

Fixes #11146

This PR fixes a bug where the SWC minifier does not encode non-ASCII characters in regular expressions when the `ascii_only` option is enabled. Previously, when `ascii_only: true` was set, non-ASCII characters in strings were correctly encoded to Unicode escape sequences, but regex patterns were left unchanged. This PR ensures regex patterns receive the same treatment.

## Changes

### 1. Added `encode_regex_for_ascii` function

A new helper function in `crates/swc_ecma_codegen/src/lit.rs` that:

- Encodes non-ASCII characters in regex patterns to Unicode escape sequences
- Uses `\xHH` format for characters in range `\x7f` to `\xff`
- Uses `\uHHHH` format for characters above `\xff`
- Encodes characters beyond BMP (U+FFFF) as surrogate pairs for compatibility
- Preserves ASCII characters as-is for optimal performance
- Returns borrowed string when `ascii_only: false` or pattern is pure ASCII

### 2. Updated `Lit::Regex` emission logic

Modified the regex literal emission in `lit.rs:32-39` to:

- Check if `ascii_only` is enabled via `emitter.cfg.ascii_only`
- Apply the encoding function to the regex expression before writing
- Maintain the same behavior as string literal encoding

### 3. Added comprehensive unit tests

Five new tests in `crates/swc_ecma_codegen/src/tests.rs`:

- `ascii_only_regex_1`: Verifies non-ASCII chars preserved when `ascii_only: false`
- `ascii_only_regex_2`: Verifies encoding with specific example from issue #11146
- `ascii_only_regex_3`: Tests emoji preservation when `ascii_only: false`
- `ascii_only_regex_4`: Tests emoji encoding when `ascii_only: true`
- `ascii_only_regex_5`: Ensures pure ASCII regex unchanged with `ascii_only: true`

## Example

**Input:**

```javascript
/[\w@Ø-ÞÀ-Öß-öø-ÿ]/
```

**Output with `ascii_only: false`:**

```javascript
/[\w@Ø-ÞÀ-Öß-öø-ÿ]/
```

**Output with `ascii_only: true`:**

```javascript
/[\w@\xd8-\xde\xc0-\xd6\xdf-\xf6\xf8-\xff]/
```

## Test plan

- [x] All new unit tests pass (`cargo test -p swc_ecma_codegen --lib -- ascii_only_regex`)
- [x] Code formatted with `cargo fmt --all`
- [x] Implementation follows existing patterns from `get_quoted_utf16` function
- [x] Adheres to CLAUDE.md requirements (performance-focused, documented, English comments)

## Related

- Issue: #11146
- Similar behavior to how Terser handles `ascii_only` option

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: claude[bot] <41898282+claude[bot]@users.noreply.github.com>
Co-authored-by: Claude <noreply@anthropic.com>
Co-authored-by: Donny/강동윤 <kdy.1997.dev@gmail.com>
diff --git a/.changeset/fast-pens-admire.md b/.changeset/fast-pens-admire.md
@@ -0,0 +1,6 @@
+---
+swc_ecma_codegen: patch
+swc_core: patch
+---
+
+fix(codegen): Encode non-ASCII chars in regex with ascii_only option
diff --git a/crates/swc_ecma_codegen/src/lit.rs b/crates/swc_ecma_codegen/src/lit.rs
@@ -31,7 +31,9 @@ impl MacroNode for Lit {
             Lit::Num(ref n) => emit!(n),
             Lit::Regex(ref n) => {
                 punct!(emitter, "/");
-                emitter.wr.write_str(&n.exp)?;
+                // Encode non-ASCII characters in regex pattern when ascii_only is enabled
+                let encoded_exp = encode_regex_for_ascii(&n.exp, emitter.cfg.ascii_only);
+                emitter.wr.write_str(&encoded_exp)?;
                 punct!(emitter, "/");
                 emitter.wr.write_str(&n.flags)?;
             }
@@ -315,6 +317,54 @@ where
     }
 }
 
+/// Encodes non-ASCII characters in regex patterns when ascii_only is enabled.
+///
+/// When `ascii_only` is `false`, or `pattern` is already pure ASCII, the
+/// pattern is returned borrowed and untouched. Otherwise every character
+/// outside `\x00..=\x7e` is replaced by an escape sequence:
+///
+/// * `\x7f..=\xff` becomes `\xHH`
+/// * any other BMP character (including U+2028 / U+2029) becomes `\uHHHH`
+/// * characters above U+FFFF become a `\uHHHH\uHHHH` surrogate pair
+///
+/// A backslash that directly escapes such a character (e.g. `\é`) is dropped.
+/// Keeping it would emit `\\xe9`, which is an escaped backslash followed by
+/// the literal text `xe9` and therefore matches something different.
+///
+/// # Arguments
+/// * `pattern` - The regex pattern string to encode
+/// * `ascii_only` - Whether to encode non-ASCII characters
+///
+/// # Returns
+/// The borrowed input when nothing has to be encoded, otherwise an owned
+/// string in which the characters listed above are written as escapes.
+///
+/// NOTE(review): `\xHH` is presumably not accepted inside named-group names
+/// (`(?<name>` / `\k<name>`); such names are not treated specially here.
+/// TODO confirm whether they need `\uHHHH` instead.
+pub fn encode_regex_for_ascii(pattern: &str, ascii_only: bool) -> CowStr {
+    if !ascii_only || pattern.is_ascii() {
+        return CowStr::Borrowed(pattern);
+    }
+
+    let mut buf = CompactString::with_capacity(pattern.len());
+    // Set while the previous character was an unescaped backslash that has
+    // not been written to `buf` yet.
+    let mut pending_backslash = false;
+
+    for c in pattern.chars() {
+        // Characters up to \x7e are preserved as-is.
+        if matches!(c, '\x00'..='\x7e') {
+            if pending_backslash {
+                // Complete the two-character escape sequence (`\d`, `\\`, ...).
+                buf.push('\\');
+                buf.push(c);
+                pending_backslash = false;
+            } else if c == '\\' {
+                pending_backslash = true;
+            } else {
+                buf.push(c);
+            }
+            continue;
+        }
+
+        // `c` is rewritten as an escape. A pending backslash was only an
+        // identity escape of `c`, so it must not be emitted in front of it.
+        pending_backslash = false;
+
+        let code_point = c as u32;
+        // Writing into the in-memory buffer does not fail.
+        if code_point <= 0xff {
+            write!(&mut buf, "\\x{code_point:02x}").unwrap();
+        } else if code_point <= 0xffff {
+            write!(&mut buf, "\\u{code_point:04x}").unwrap();
+        } else {
+            // Characters beyond BMP are encoded as surrogate pairs for compatibility
+            let h = ((code_point - 0x10000) / 0x400) + 0xd800;
+            let l = ((code_point - 0x10000) % 0x400) + 0xdc00;
+            write!(&mut buf, "\\u{h:04x}\\u{l:04x}").unwrap();
+        }
+    }
+
+    // A trailing backslash has nothing to escape; keep the input intact.
+    if pending_backslash {
+        buf.push('\\');
+    }
+
+    CowStr::Owned(buf)
+}
+
 /// Returns `(quote_char, value)`
 pub fn get_quoted_utf16(v: &str, ascii_only: bool, target: EsVersion) -> (AsciiChar, CowStr) {
     // Fast path: If the string is ASCII and doesn't need escaping, we can avoid
diff --git a/crates/swc_ecma_codegen/src/tests.rs b/crates/swc_ecma_codegen/src/tests.rs
@@ -892,6 +892,71 @@ export default {
     );
 }
 
+/// With `ascii_only: false`, a regex containing Latin-1 characters is expected
+/// to come out unchanged.
+#[test]
+fn ascii_only_regex_1() {
+    let source = r"/[\w@Ø-ÞÀ-Öß-öø-ÿ]/";
+    let config = Config {
+        ascii_only: false,
+        ..Default::default()
+    };
+
+    test_all(source, r"/[\w@Ø-ÞÀ-Öß-öø-ÿ]/;", source, config);
+}
+
+/// With `ascii_only: true`, the Latin-1 characters of the regex are expected
+/// to be written as `\xHH` escapes.
+#[test]
+fn ascii_only_regex_2() {
+    let source = r"/[\w@Ø-ÞÀ-Öß-öø-ÿ]/";
+    let config = Config {
+        ascii_only: true,
+        ..Default::default()
+    };
+
+    test_all(
+        source,
+        r"/[\w@\xd8-\xde\xc0-\xd6\xdf-\xf6\xf8-\xff]/;",
+        r"/[\w@\xd8-\xde\xc0-\xd6\xdf-\xf6\xf8-\xff]/",
+        config,
+    );
+}
+
+/// With `ascii_only: false`, emoji inside a regex are expected to come out
+/// unchanged.
+#[test]
+fn ascii_only_regex_3() {
+    let source = r"/[😊❤️]/g";
+    let config = Config {
+        ascii_only: false,
+        ..Default::default()
+    };
+
+    test_all(source, r"/[😊❤️]/g;", source, config);
+}
+
+/// With `ascii_only: true`, emoji inside a regex are expected to be written
+/// as `\uHHHH` escapes (a surrogate pair for the character beyond the BMP).
+#[test]
+fn ascii_only_regex_4() {
+    let source = r"/[😊❤️]/g";
+    let config = Config {
+        ascii_only: true,
+        ..Default::default()
+    };
+
+    test_all(
+        source,
+        r"/[\ud83d\ude0a\u2764\ufe0f]/g;",
+        r"/[\ud83d\ude0a\u2764\ufe0f]/g",
+        config,
+    );
+}
+
+/// With `ascii_only: true`, a regex that is already pure ASCII is expected to
+/// come out unchanged.
+#[test]
+fn ascii_only_regex_5() {
+    let source = r"/test/";
+    let config = Config {
+        ascii_only: true,
+        ..Default::default()
+    };
+
+    test_all(source, r"/test/;", source, config);
+}
+
 #[test]
 fn emit_type_import_statement_named() {
     let from = r#"