Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit 76bff8a

Browse files
committed
Ignore invalid character encodings.
1 parent e70a592 commit 76bff8a

File tree

2 files changed

+33
-16
lines changed

2 files changed

+33
-16
lines changed

synapse/rest/media/v1/preview_url_resource.py

+11 -7
Original file line number | Diff line number | Diff line change
@@ -632,9 +632,12 @@ def try_remove_parent_dirs(dirs: Iterable[str]) -> None:
632632
logger.debug("No media removed from url cache")
633633

634634

635-
def _normalise_encoding(encoding: str) -> str:
635+
def _normalise_encoding(encoding: str) -> Optional[str]:
636636
"""Use the Python codec's name as the normalised entry."""
637-
return codecs.lookup(encoding).name
637+
try:
638+
return codecs.lookup(encoding).name
639+
except LookupError:
640+
return None
638641

639642

640643
def get_html_media_encodings(body: bytes, content_type: Optional[str]) -> Iterable[str]:
@@ -668,16 +671,17 @@ def get_html_media_encodings(body: bytes, content_type: Optional[str]) -> Iterab
668671
match = _charset_match.search(body_start)
669672
if match:
670673
encoding = _normalise_encoding(match.group(1).decode("ascii"))
671-
attempted_encodings.add(encoding)
672-
yield encoding
674+
if encoding:
675+
attempted_encodings.add(encoding)
676+
yield encoding
673677

674678
# TODO Support <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
675679

676680
# Check if it has an XML document with an encoding.
677681
match = _xml_encoding_match.match(body_start)
678682
if match:
679683
encoding = _normalise_encoding(match.group(1).decode("ascii"))
680-
if encoding not in attempted_encodings:
684+
if encoding and encoding not in attempted_encodings:
681685
attempted_encodings.add(encoding)
682686
yield encoding
683687

@@ -686,12 +690,12 @@ def get_html_media_encodings(body: bytes, content_type: Optional[str]) -> Iterab
686690
content_match = _content_type_match.match(content_type)
687691
if content_match:
688692
encoding = _normalise_encoding(content_match.group(1))
689-
if encoding not in attempted_encodings:
693+
if encoding and encoding not in attempted_encodings:
690694
attempted_encodings.add(encoding)
691695
yield encoding
692696

693697
# Finally, fallback to UTF-8, then windows-1252.
694-
for fallback in ("utf-8", "windows-1252"):
698+
for fallback in ("utf-8", "cp1252"):
695699
if fallback not in attempted_encodings:
696700
yield fallback
697701

tests/test_preview.py

+22 -9
Original file line number | Diff line number | Diff line change
@@ -307,7 +307,7 @@ def test_invalid_encoding2(self):
307307
self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})
308308

309309
def test_windows_1252(self):
310-
"""A body which uses windows-1252, but doesn't declare that."""
310+
"""A body which uses cp1252, but doesn't declare that."""
311311
html = b"""
312312
<html>
313313
<head><title>\xf3</title></head>
@@ -333,7 +333,7 @@ def test_meta_charset(self):
333333
""",
334334
"text/html",
335335
)
336-
self.assertEqual(list(encodings), ["ascii", "utf-8", "windows-1252"])
336+
self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
337337

338338
# A less well-formed version.
339339
encodings = get_html_media_encodings(
@@ -345,7 +345,7 @@ def test_meta_charset(self):
345345
""",
346346
"text/html",
347347
)
348-
self.assertEqual(list(encodings), ["ascii", "utf-8", "windows-1252"])
348+
self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
349349

350350
def test_meta_charset_underscores(self):
351351
"""A character encoding contains underscore."""
@@ -358,7 +358,7 @@ def test_meta_charset_underscores(self):
358358
""",
359359
"text/html",
360360
)
361-
self.assertEqual(list(encodings), ["shift_jis", "utf-8", "windows-1252"])
361+
self.assertEqual(list(encodings), ["shift_jis", "utf-8", "cp1252"])
362362

363363
def test_xml_encoding(self):
364364
"""A character encoding is found via the meta tag."""
@@ -370,7 +370,7 @@ def test_xml_encoding(self):
370370
""",
371371
"text/html",
372372
)
373-
self.assertEqual(list(encodings), ["ascii", "utf-8", "windows-1252"])
373+
self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
374374

375375
def test_meta_xml_encoding(self):
376376
"""Meta tags take precedence over XML encoding."""
@@ -384,7 +384,7 @@ def test_meta_xml_encoding(self):
384384
""",
385385
"text/html",
386386
)
387-
self.assertEqual(list(encodings), ["utf-16", "ascii", "utf-8", "windows-1252"])
387+
self.assertEqual(list(encodings), ["utf-16", "ascii", "utf-8", "cp1252"])
388388

389389
def test_content_type(self):
390390
"""A character encoding is found via the Content-Type header."""
@@ -399,12 +399,12 @@ def test_content_type(self):
399399
)
400400
for header in headers:
401401
encodings = get_html_media_encodings(b"", header)
402-
self.assertEqual(list(encodings), ["ascii", "utf-8", "windows-1252"])
402+
self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
403403

404404
def test_fallback(self):
405405
"""A character encoding cannot be found in the body or header."""
406406
encodings = get_html_media_encodings(b"", "text/html")
407-
self.assertEqual(list(encodings), ["utf-8", "windows-1252"])
407+
self.assertEqual(list(encodings), ["utf-8", "cp1252"])
408408

409409
def test_duplicates(self):
410410
"""Ensure each encoding is only attempted once."""
@@ -418,4 +418,17 @@ def test_duplicates(self):
418418
""",
419419
'text/html; charset="UTF_8"',
420420
)
421-
self.assertEqual(list(encodings), ["utf-8", "windows-1252"])
421+
self.assertEqual(list(encodings), ["utf-8", "cp1252"])
422+
423+
def test_unknown_invalid(self):
424+
"""A character encoding should be ignored if it is unknown or invalid."""
425+
encodings = get_html_media_encodings(
426+
b"""
427+
<html>
428+
<head><meta charset="invalid">
429+
</head>
430+
</html>
431+
""",
432+
'text/html; charset="invalid"',
433+
)
434+
self.assertEqual(list(encodings), ["utf-8", "cp1252"])

0 commit comments

Comments
 (0)