Skip to content

Commit 2ef7326

Browse files
committed
Implement unicode escape decoding
Unicode escapes in CSS were not properly decoded before security checks. Decoding them first prevents attackers from bypassing filters using escape sequences.
1 parent 7c854af commit 2ef7326

File tree

3 files changed

+76
-1
lines changed

3 files changed

+76
-1
lines changed

CHANGES.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,13 @@ lxml_html_clean changelog
66
Unreleased
77
==========
88

9+
Bugs fixed
10+
----------
11+
12+
* Fixed a bug where Unicode escapes in CSS were not properly decoded
13+
before security checks. Decoding them first prevents attackers from bypassing filters
14+
using escape sequences.
15+
916
0.4.3 (2025-10-02)
1017
==================
1118

lxml_html_clean/clean.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -578,6 +578,26 @@ def _remove_javascript_link(self, link):
578578
_comments_re = re.compile(r'/\*.*?\*/', re.S)
579579
_find_comments = _comments_re.finditer
580580
_substitute_comments = _comments_re.sub
581+
_css_unicode_escape_re = re.compile(r'\\([0-9a-fA-F]{1,6})\s?')
582+
583+
def _decode_css_unicode_escapes(self, style):
584+
"""
585+
Decode CSS Unicode escape sequences like \\69 or \\000069 to their
586+
actual character values. This prevents bypassing security checks
587+
using CSS escape sequences.
588+
589+
CSS escape syntax: backslash followed by 1-6 hex digits,
590+
optionally followed by a whitespace character.
591+
"""
592+
def replace_escape(match):
593+
hex_value = match.group(1)
594+
try:
595+
return chr(int(hex_value, 16))
596+
except (ValueError, OverflowError):
597+
# Invalid unicode codepoint, keep original
598+
return match.group(0)
599+
600+
return self._css_unicode_escape_re.sub(replace_escape, style)
581601

582602
def _has_sneaky_javascript(self, style):
583603
"""
@@ -591,7 +611,7 @@ def _has_sneaky_javascript(self, style):
591611
more sneaky attempts.
592612
"""
593613
style = self._substitute_comments('', style)
594-
style = style.replace('\\', '')
614+
style = self._decode_css_unicode_escapes(style)
595615
style = _substitute_whitespace('', style)
596616
style = style.lower()
597617
if _has_javascript_scheme(style):

tests/test_clean.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -393,3 +393,51 @@ def test_possibly_invalid_url_without_whitelist(self):
393393
self.assertEqual(len(w), 0)
394394
self.assertNotIn("google.com", result)
395395
self.assertNotIn("example.com", result)
396+
397+
def test_unicode_escape_in_style(self):
    """CSS Unicode escapes must be decoded before the style security checks.

    Escape syntax: a backslash followed by 1-6 hex digits (\\HHHHHH).
    Without decoding, an attacker could hide dangerous keywords behind
    such escapes and slip past the filters.
    """
    # --- inline ``style`` attributes (need safe_attrs_only=False) ---
    without_style_attr = '<div>test</div>'
    inline_payloads = (
        # \6a\61\76\61\73\63\72\69\70\74 spells "javascript"
        '<div style="background: url(\\6a\\61\\76\\61\\73\\63\\72\\69\\70\\74:alert(1))">test</div>',
        # \69 is 'i', hence \69mport reads "import"
        '<div style="@\\69mport url(evil.css)">test</div>',
        # same, with the space after \69 swallowed by the escape
        '<div style="@\\69 mport url(evil.css)">test</div>',
        # \65\78\70\72\65\73\73\69\6f\6e spells "expression"
        '<div style="\\65\\78\\70\\72\\65\\73\\73\\69\\6f\\6e(alert(1))">test</div>',
    )
    attr_cleaner = Cleaner(safe_attrs_only=False)

    for html in inline_payloads:
        with self.subTest(html=html):
            self.assertEqual(without_style_attr, attr_cleaner.clean_html(html))

    # --- ``<style>`` element content (module-level clean_html defaults) ---
    emptied_style = '<div><style>/* deleted */</style></div>'
    style_tag_payloads = (
        # escaped "javascript:" inside url()
        '<style>url(\\6a\\61\\76\\61\\73\\63\\72\\69\\70\\74:alert(1))</style>',
        # escaped "javascript:" with no url() wrapper
        '<style>\\6a\\61\\76\\61\\73\\63\\72\\69\\70\\74:alert(1)</style>',
        # escaped "expression"
        '<style>\\65\\78\\70\\72\\65\\73\\73\\69\\6f\\6e(alert(1))</style>',
        # @import with an escaped 'i'
        '<style>@\\69mport url(evil.css)</style>',
        # escaped "data:" scheme
        '<style>url(\\64\\61\\74\\61:image/svg+xml;base64,PHN2ZyBvbmxvYWQ9YWxlcnQoMSk+)</style>',
        # the space following the escape is consumed: \69 mport -> "import"
        '<style>@\\69 mport url(evil.css)</style>',
        # full six-digit form: \000069 is 'i'
        '<style>@\\000069mport url(evil.css)</style>',
        # six-digit form followed by a space
        '<style>@\\000069 mport url(evil.css)</style>',
    )

    for html in style_tag_payloads:
        with self.subTest(html=html):
            self.assertEqual(emptied_style, clean_html(html))

0 commit comments

Comments
 (0)