Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Commit

Permalink
Support underscores (in addition to hyphens) for charset detection. (#10410)
Browse files Browse the repository at this point in the history
  • Loading branch information
srividyut authored Jul 27, 2021
1 parent 5b22d5e commit 8e1febc
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 2 deletions.
1 change: 1 addition & 0 deletions changelog.d/10410.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Improve character set detection in URL previews by supporting underscores (in addition to hyphens). Contributed by @srividyut.
6 changes: 4 additions & 2 deletions synapse/rest/media/v1/preview_url_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,11 @@

logger = logging.getLogger(__name__)

_charset_match = re.compile(br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9-]+)"?', flags=re.I)
_charset_match = re.compile(
br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9_-]+)"?', flags=re.I
)
_xml_encoding_match = re.compile(
br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9-]+)"', flags=re.I
br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9_-]+)"', flags=re.I
)
_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)

Expand Down
13 changes: 13 additions & 0 deletions tests/test_preview.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,19 @@ def test_meta_charset(self):
)
self.assertEqual(encoding, "ascii")

def test_meta_charset_underscores(self):
    """A charset name containing an underscore is read from the meta tag."""
    # The document declares its charset as "Shift_JIS", a name that uses an
    # underscore rather than a hyphen.
    document = b"""
<html>
<head><meta charset="Shift_JIS">
</head>
</html>
"""
    detected = get_html_media_encoding(document, "text/html")
    self.assertEqual(detected, "Shift_JIS")

def test_xml_encoding(self):
"""A character encoding is found via the meta tag."""
encoding = get_html_media_encoding(
Expand Down

0 comments on commit 8e1febc

Please sign in to comment.