This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit 4587b35

Clean-up logic for rebasing URLs during URL preview. (#12219)
This is done by using urljoin from the standard library and by reducing the number of places where URLs are rebased.
1 parent dda9b7f commit 4587b35
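
For context, urljoin from the standard library covers the cases the removed rebase_url helper documented. A minimal standalone sketch (not part of this commit):

    from urllib.parse import urljoin

    # Relative references resolve against the base URL's directory.
    assert urljoin("https://example.com/foo/", "subpage") == "https://example.com/foo/subpage"
    assert urljoin("https://example.com/foo", "sibling") == "https://example.com/sibling"
    # Absolute paths replace the base path entirely.
    assert urljoin("https://example.com/foo/", "/bar") == "https://example.com/bar"
    # Already-absolute URLs are returned unchanged.
    assert urljoin("https://example.com/foo/", "https://alice.com/a/") == "https://alice.com/a/"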

File tree

4 files changed (+26, -91 lines)


changelog.d/12219.misc

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+Clean-up logic around rebasing URLs for URL image previews.

synapse/rest/media/v1/preview_html.py

Lines changed: 2 additions & 37 deletions

@@ -16,7 +16,6 @@
 import logging
 import re
 from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Set, Union
-from urllib import parse as urlparse

 if TYPE_CHECKING:
     from lxml import etree
@@ -144,9 +143,7 @@ def decode_body(
     return etree.fromstring(body, parser)


-def parse_html_to_open_graph(
-    tree: "etree.Element", media_uri: str
-) -> Dict[str, Optional[str]]:
+def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
     """
     Parse the HTML document into an Open Graph response.

@@ -155,7 +152,6 @@ def parse_html_to_open_graph(

     Args:
         tree: The parsed HTML document.
-        media_url: The URI used to download the body.

     Returns:
         The Open Graph response as a dictionary.
@@ -209,7 +205,7 @@ def parse_html_to_open_graph(
         "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content"
     )
     if meta_image:
-        og["og:image"] = rebase_url(meta_image[0], media_uri)
+        og["og:image"] = meta_image[0]
     else:
         # TODO: consider inlined CSS styles as well as width & height attribs
         images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
@@ -320,37 +316,6 @@ def _iterate_over_text(
     )


-def rebase_url(url: str, base: str) -> str:
-    """
-    Resolves a potentially relative `url` against an absolute `base` URL.
-
-    For example:
-
-    >>> rebase_url("subpage", "https://example.com/foo/")
-    'https://example.com/foo/subpage'
-    >>> rebase_url("sibling", "https://example.com/foo")
-    'https://example.com/sibling'
-    >>> rebase_url("/bar", "https://example.com/foo/")
-    'https://example.com/bar'
-    >>> rebase_url("https://alice.com/a/", "https://example.com/foo/")
-    'https://alice.com/a'
-    """
-    base_parts = urlparse.urlparse(base)
-    # Convert the parsed URL to a list for (potential) modification.
-    url_parts = list(urlparse.urlparse(url))
-    # Add a scheme, if one does not exist.
-    if not url_parts[0]:
-        url_parts[0] = base_parts.scheme or "http"
-    # Fix up the hostname, if this is not a data URL.
-    if url_parts[0] != "data" and not url_parts[1]:
-        url_parts[1] = base_parts.netloc
-    # If the path does not start with a /, nest it under the base path's last
-    # directory.
-    if not url_parts[2].startswith("/"):
-        url_parts[2] = re.sub(r"/[^/]+$", "/", base_parts.path) + url_parts[2]
-    return urlparse.urlunparse(url_parts)
-
-
 def summarize_paragraphs(
     text_nodes: Iterable[str], min_size: int = 200, max_size: int = 500
 ) -> Optional[str]:
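
With this change, parse_html_to_open_graph no longer needs the page URL: any og:image value is returned as found in the HTML (possibly relative), and resolution is deferred to the caller. A hedged sketch of the new behaviour, assuming the helpers shown in this commit and an illustrative HTML snippet (not taken from the repository):

    from synapse.rest.media.v1.preview_html import decode_body, parse_html_to_open_graph

    html = b"""
    <html>
    <head><meta itemprop="image" content="images/cover.png"></head>
    <body>Some text.</body>
    </html>
    """
    tree = decode_body(html, "http://example.com/test.html")
    og = parse_html_to_open_graph(tree)
    # Expected here: og["og:image"] == "images/cover.png" -- the relative value is
    # kept as-is; _precache_image_url resolves it against the page URL later.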

synapse/rest/media/v1/preview_url_resource.py

Lines changed: 12 additions & 11 deletions

@@ -22,7 +22,7 @@
 import sys
 import traceback
 from typing import TYPE_CHECKING, BinaryIO, Iterable, Optional, Tuple
-from urllib import parse as urlparse
+from urllib.parse import urljoin, urlparse, urlsplit
 from urllib.request import urlopen

 import attr
@@ -44,11 +44,7 @@
 from synapse.rest.media.v1._base import get_filename_from_headers
 from synapse.rest.media.v1.media_storage import MediaStorage
 from synapse.rest.media.v1.oembed import OEmbedProvider
-from synapse.rest.media.v1.preview_html import (
-    decode_body,
-    parse_html_to_open_graph,
-    rebase_url,
-)
+from synapse.rest.media.v1.preview_html import decode_body, parse_html_to_open_graph
 from synapse.types import JsonDict, UserID
 from synapse.util import json_encoder
 from synapse.util.async_helpers import ObservableDeferred
@@ -187,7 +183,7 @@ async def _async_render_GET(self, request: SynapseRequest) -> None:
         ts = self.clock.time_msec()

         # XXX: we could move this into _do_preview if we wanted.
-        url_tuple = urlparse.urlsplit(url)
+        url_tuple = urlsplit(url)
         for entry in self.url_preview_url_blacklist:
             match = True
             for attrib in entry:
@@ -322,7 +318,7 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:

                 # Parse Open Graph information from the HTML in case the oEmbed
                 # response failed or is incomplete.
-                og_from_html = parse_html_to_open_graph(tree, media_info.uri)
+                og_from_html = parse_html_to_open_graph(tree)

                 # Compile the Open Graph response by using the scraped
                 # information from the HTML and overlaying any information
@@ -588,12 +584,17 @@ async def _precache_image_url(
         if "og:image" not in og or not og["og:image"]:
             return

+        # The image URL from the HTML might be relative to the previewed page,
+        # convert it to an URL which can be requested directly.
+        image_url = og["og:image"]
+        url_parts = urlparse(image_url)
+        if url_parts.scheme != "data":
+            image_url = urljoin(media_info.uri, image_url)
+
         # FIXME: it might be cleaner to use the same flow as the main /preview_url
         # request itself and benefit from the same caching etc. But for now we
         # just rely on the caching on the master request to speed things up.
-        image_info = await self._handle_url(
-            rebase_url(og["og:image"], media_info.uri), user, allow_data_urls=True
-        )
+        image_info = await self._handle_url(image_url, user, allow_data_urls=True)

         if _is_media(image_info.media_type):
             # TODO: make sure we don't choke on white-on-transparent images
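
To isolate the resolution step this hunk adds to _precache_image_url, here is a minimal sketch; resolve_image_url is a hypothetical name used only for illustration (the commit inlines this logic):

    from urllib.parse import urljoin, urlparse

    def resolve_image_url(page_url: str, og_image: str) -> str:
        # Hypothetical helper mirroring the inlined logic above: data: URLs are
        # left untouched, anything else (including relative references) is
        # resolved against the URL of the previewed page.
        if urlparse(og_image).scheme == "data":
            return og_image
        return urljoin(page_url, og_image)

    # Example: a relative og:image on the previewed page becomes directly requestable.
    assert resolve_image_url(
        "https://example.com/blog/post", "images/cover.png"
    ) == "https://example.com/blog/images/cover.png"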

tests/rest/media/v1/test_html_preview.py

Lines changed: 11 additions & 43 deletions

@@ -16,7 +16,6 @@
     _get_html_media_encodings,
     decode_body,
     parse_html_to_open_graph,
-    rebase_url,
     summarize_paragraphs,
 )

@@ -161,7 +160,7 @@ def test_simple(self) -> None:
         """

         tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree)

         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})

@@ -177,7 +176,7 @@ def test_comment(self) -> None:
         """

         tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree)

         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})

@@ -196,7 +195,7 @@ def test_comment2(self) -> None:
         """

         tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree)

         self.assertEqual(
             og,
@@ -218,7 +217,7 @@ def test_script(self) -> None:
         """

         tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree)

         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})

@@ -232,7 +231,7 @@ def test_missing_title(self) -> None:
         """

         tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree)

         self.assertEqual(og, {"og:title": None, "og:description": "Some text."})

@@ -247,7 +246,7 @@ def test_h1_as_title(self) -> None:
         """

         tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree)

         self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})

@@ -262,7 +261,7 @@ def test_missing_title_and_broken_h1(self) -> None:
         """

         tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree)

         self.assertEqual(og, {"og:title": None, "og:description": "Some text."})

@@ -290,7 +289,7 @@ def test_xml(self) -> None:
         <head><title>Foo</title></head><body>Some text.</body></html>
         """.strip()
         tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree)
         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})

     def test_invalid_encoding(self) -> None:
@@ -304,7 +303,7 @@ def test_invalid_encoding(self) -> None:
         </html>
         """
         tree = decode_body(html, "http://example.com/test.html", "invalid-encoding")
-        og = parse_html_to_open_graph(tree, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree)
         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})

     def test_invalid_encoding2(self) -> None:
@@ -319,7 +318,7 @@ def test_invalid_encoding2(self) -> None:
         </html>
         """
         tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree)
         self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})

     def test_windows_1252(self) -> None:
@@ -333,7 +332,7 @@ def test_windows_1252(self) -> None:
         </html>
         """
         tree = decode_body(html, "http://example.com/test.html")
-        og = parse_html_to_open_graph(tree, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree)
         self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."})


@@ -448,34 +447,3 @@ def test_unknown_invalid(self) -> None:
             'text/html; charset="invalid"',
         )
         self.assertEqual(list(encodings), ["utf-8", "cp1252"])
-
-
-class RebaseUrlTestCase(unittest.TestCase):
-    def test_relative(self) -> None:
-        """Relative URLs should be resolved based on the context of the base URL."""
-        self.assertEqual(
-            rebase_url("subpage", "https://example.com/foo/"),
-            "https://example.com/foo/subpage",
-        )
-        self.assertEqual(
-            rebase_url("sibling", "https://example.com/foo"),
-            "https://example.com/sibling",
-        )
-        self.assertEqual(
-            rebase_url("/bar", "https://example.com/foo/"),
-            "https://example.com/bar",
-        )
-
-    def test_absolute(self) -> None:
-        """Absolute URLs should not be modified."""
-        self.assertEqual(
-            rebase_url("https://alice.com/a/", "https://example.com/foo/"),
-            "https://alice.com/a/",
-        )
-
-    def test_data(self) -> None:
-        """Data URLs should not be modified."""
-        self.assertEqual(
-            rebase_url("data:,Hello%2C%20World%21", "https://example.com/foo/"),
-            "data:,Hello%2C%20World%21",
-        )
