Implement unicode escape decoding

frenzymadness · frenzymadness · commit 2ef732667ddb · 2026-02-27T10:32:37.000+01:00
Unicode escapes in CSS were not properly decoded before security
checks. Decoding them first prevents attackers from bypassing filters
using escape sequences.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -6,6 +6,13 @@ lxml_html_clean changelog
 Unreleased
 ==========
 
+Bugs fixed
+----------
+
+* Fixed a bug where Unicode escapes in CSS were not properly decoded
+  before security checks. Decoding them first prevents attackers from
+  bypassing filters using escape sequences.
+
 0.4.3 (2025-10-02)
 ==================
 
diff --git a/lxml_html_clean/clean.py b/lxml_html_clean/clean.py
@@ -578,6 +578,26 @@ def _remove_javascript_link(self, link):
     _comments_re = re.compile(r'/\*.*?\*/', re.S)
     _find_comments = _comments_re.finditer
     _substitute_comments = _comments_re.sub
+    _css_unicode_escape_re = re.compile(r'\\([0-9a-fA-F]{1,6})\s?')
+
+    def _decode_css_unicode_escapes(self, style):
+        """
+        Return ``style`` with its CSS escapes decoded for security checks.
+
+        A backslash plus 1-6 hex digits and one optional whitespace becomes
+        that character (``\\69`` or ``\\000069`` -> ``i``). Backslashes left
+        over are dropped, since CSS also reads ``@\\import`` as ``@import``.
+        """
+        def replace_escape(match):
+            hex_value = match.group(1)
+            try:
+                return chr(int(hex_value, 16))
+            except (ValueError, OverflowError):
+                # Above U+10FFFF: not a character, leave the text undecoded
+                return match.group(0)
+
+        decoded = self._css_unicode_escape_re.sub(replace_escape, style)
+        return decoded.replace('\\', '')
 
     def _has_sneaky_javascript(self, style):
         """
@@ -591,7 +611,7 @@ def _has_sneaky_javascript(self, style):
         more sneaky attempts.
         """
         style = self._substitute_comments('', style)
-        style = style.replace('\\', '')
+        style = self._decode_css_unicode_escapes(style)
         style = _substitute_whitespace('', style)
         style = style.lower()
         if _has_javascript_scheme(style):
diff --git a/tests/test_clean.py b/tests/test_clean.py
@@ -393,3 +393,51 @@ def test_possibly_invalid_url_without_whitelist(self):
             self.assertEqual(len(w), 0)
         self.assertNotIn("google.com", result)
         self.assertNotIn("example.com", result)
+
+    def test_unicode_escape_in_style(self):
+        # CSS lets any character be written as a backslash followed by
+        # 1-6 hex digits. The cleaner has to decode those escapes before it
+        # runs its security checks, or the payloads below would get through.
+
+        # Inline style attributes are only inspected with safe_attrs_only=False
+        attr_cleaner = Cleaner(safe_attrs_only=False)
+        stripped_div = '<div>test</div>'
+        inline_payloads = [
+            # "javascript" spelled as \6a\61\76\61\73\63\72\69\70\74
+            '<div style="background: url(\\6a\\61\\76\\61\\73\\63\\72\\69\\70\\74:alert(1))">test</div>',
+            # "import" with its leading 'i' written as \69
+            '<div style="@\\69mport url(evil.css)">test</div>',
+            # same escape followed by the optional space that terminates it
+            '<div style="@\\69 mport url(evil.css)">test</div>',
+            # "expression" spelled as \65\78\70\72\65\73\73\69\6f\6e
+            '<div style="\\65\\78\\70\\72\\65\\73\\73\\69\\6f\\6e(alert(1))">test</div>',
+        ]
+
+        for html in inline_payloads:
+            with self.subTest(html=html):
+                self.assertEqual(stripped_div, attr_cleaner.clean_html(html))
+
+        # <style> element content goes through the default clean_html()
+        deleted_style = '<div><style>/* deleted */</style></div>'
+        style_payloads = [
+            # escaped "javascript:" inside url()
+            '<style>url(\\6a\\61\\76\\61\\73\\63\\72\\69\\70\\74:alert(1))</style>',
+            # escaped "javascript:" with no url() wrapper
+            '<style>\\6a\\61\\76\\61\\73\\63\\72\\69\\70\\74:alert(1)</style>',
+            # escaped "expression"
+            '<style>\\65\\78\\70\\72\\65\\73\\73\\69\\6f\\6e(alert(1))</style>',
+            # @import with an escaped 'i'
+            '<style>@\\69mport url(evil.css)</style>',
+            # escaped "data:" scheme
+            '<style>url(\\64\\61\\74\\61:image/svg+xml;base64,PHN2ZyBvbmxvYWQ9YWxlcnQoMSk+)</style>',
+            # terminating space after the escape: \69 mport = "import"
+            '<style>@\\69 mport url(evil.css)</style>',
+            # full six-digit form: \000069 = 'i'
+            '<style>@\\000069mport url(evil.css)</style>',
+            # six-digit form followed by the terminating space
+            '<style>@\\000069 mport url(evil.css)</style>',
+        ]
+
+        for html in style_payloads:
+            with self.subTest(html=html):
+                self.assertEqual(deleted_style, clean_html(html))