Skip to content

Commit fd10d79

Browse files
committed
Add more tests for different combinations of backslashes and unicode
1 parent 5b7e228 commit fd10d79

File tree

1 file changed

+96
-0
lines changed

1 file changed

+96
-0
lines changed

tests/test_clean.py

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -489,3 +489,99 @@ def test_unicode_escape_in_style(self):
489489
with self.subTest(html=html):
490490
cleaned = clean_html(html)
491491
self.assertEqual('<div><style>/* deleted */</style></div>', cleaned)
492+
493+
def test_unicode_escape_mixed_with_comments(self):
    # CSS comments interleaved with hex escapes must not hide a dangerous
    # construct: for every payload the expected output has the <style>
    # content replaced by the "/* deleted */" marker.
    expected = '<div><style>/* deleted */</style></div>'
    payloads = (
        # Comment sits between '@' and the escaped 'i' (\69).
        '<style>@/*comment*/\\69mport url(evil.css)</style>',
        # Comment follows the keyword whose first letter is escaped (\69 = 'i').
        '<style>@\\69mport/*comment*/ url(evil.css)</style>',
        # "expression" spelled entirely with hex escapes, split by a comment.
        '<style>\\65\\78/*comment*/\\70\\72\\65\\73\\73\\69\\6f\\6e(alert(1))</style>',
    )

    for payload in payloads:
        with self.subTest(html=payload):
            self.assertEqual(expected, clean_html(payload))
508+
509+
def test_unicode_escape_case_insensitive(self):
    # Hex digits in CSS escapes may be written in upper or lower case; an
    # escaped "@import" must be caught either way (\69 = 'i', \6D = 'm', ...).
    expected = '<div><style>/* deleted */</style></div>'
    payloads = (
        # Every alphabetic hex digit upper-case: \69\6D\70\6F\72\74
        '<style>@\\69\\6D\\70\\6F\\72\\74 url(evil.css)</style>',
        # Mixed case: \6D is upper-case, \6f is lower-case
        '<style>@\\69\\6D\\70\\6f\\72\\74 url(evil.css)</style>',
    )

    for payload in payloads:
        with self.subTest(html=payload):
            self.assertEqual(expected, clean_html(payload))
523+
524+
def test_unicode_escape_various_schemes(self):
    # Script-capable URL schemes spelled with hex escapes inside url(...)
    # must be detected just like their plain-text spelling.
    expected = '<div><style>/* deleted */</style></div>'
    payloads = (
        # "vbscript" = \76\62\73\63\72\69\70\74
        '<style>url(\\76\\62\\73\\63\\72\\69\\70\\74:alert(1))</style>',
        # "jscript" = \6a\73\63\72\69\70\74
        '<style>url(\\6a\\73\\63\\72\\69\\70\\74:alert(1))</style>',
        # "livescript" = \6c\69\76\65\73\63\72\69\70\74
        '<style>url(\\6c\\69\\76\\65\\73\\63\\72\\69\\70\\74:alert(1))</style>',
        # "mocha" = \6d\6f\63\68\61
        '<style>url(\\6d\\6f\\63\\68\\61:alert(1))</style>',
    )

    for payload in payloads:
        with self.subTest(html=payload):
            self.assertEqual(expected, clean_html(payload))
541+
542+
def test_unicode_escape_with_whitespace_variations(self):
    # Each case puts a different whitespace character directly after the
    # \69 escape inside a style attribute. The string literals are not raw,
    # so \t, \n and \f below are real control characters in the markup,
    # while \\69 is a literal backslash followed by "69".
    cases = (
        # Tab after the escape
        ('<div style="@\\69\tmport url(evil.css)">test</div>', '<div>test</div>'),
        # Newline after the escape (a real newline character in the
        # attribute value, not the two characters backslash + n)
        ('<div style="@\\69\nmport url(evil.css)">test</div>', '<div>test</div>'),
        # Form feed after the escape
        ('<div style="@\\69\fmport url(evil.css)">test</div>', '<div>test</div>'),
    )
    # safe_attrs_only is disabled for this cleaner, so the expected removal
    # of the style attribute is not simply attribute whitelisting.
    cleaner = Cleaner(safe_attrs_only=False)

    for markup, expected in cases:
        with self.subTest(html=markup):
            self.assertEqual(expected, cleaner.clean_html(markup))
558+
559+
def test_backslash_removal_after_unicode_decode(self):
    # Double obfuscation: hex escapes combined with stray backslashes.
    # Once the hex escapes are decoded and the leftover backslashes are
    # dropped, both payloads read "@import" and must be rejected.
    expected = '<div><style>/* deleted */</style></div>'
    payloads = (
        # One hex escape (\69 -> 'i'), then a backslash before each
        # remaining letter of "import".
        '<style>@\\69\\m\\p\\o\\r\\t url(evil.css)</style>',
        # Hex escapes (\69, \6d, \6f) alternating with backslash-prefixed
        # plain letters.
        '<style>@\\69\\6d\\p\\6f\\r\\t url(evil.css)</style>',
    )

    for payload in payloads:
        with self.subTest(html=payload):
            self.assertEqual(expected, clean_html(payload))
573+
574+
def test_backslash_obfuscation_without_unicode(self):
    # Payloads obfuscated ONLY with backslashes (no hex escapes) must be
    # caught too: there is nothing to decode in step 1, but removing the
    # backslashes in step 2 leaves a plain "@import".
    expected = '<div><style>/* deleted */</style></div>'
    test_cases = [
        # Backslash before every letter:
        # @\i\m\p\o\r\t → @import (caught by '@import' check)
        '<style>@\\i\\m\\p\\o\\r\\t url(evil.css)</style>',
        # A single backslash in front of the keyword: @\import → @import
        '<style>@\\import url(evil.css)</style>',
    ]

    for html in test_cases:
        with self.subTest(html=html):
            cleaned = clean_html(html)
            self.assertEqual(expected, cleaned)

0 commit comments

Comments
 (0)