|
12 | 12 | # See the License for the specific language governing permissions and
|
13 | 13 | # limitations under the License.
|
14 | 14 | import logging
|
| 15 | +import urllib.parse |
15 | 16 | from typing import TYPE_CHECKING, Optional
|
16 | 17 |
|
17 | 18 | import attr
|
18 | 19 |
|
19 | 20 | from synapse.http.client import SimpleHttpClient
|
| 21 | +from synapse.types import JsonDict |
| 22 | +from synapse.util import json_decoder |
20 | 23 |
|
21 | 24 | if TYPE_CHECKING:
|
22 | 25 | from synapse.server import HomeServer
|
23 | 26 |
|
24 | 27 | logger = logging.getLogger(__name__)
|
25 | 28 |
|
26 | 29 |
|
27 |
| -@attr.s(slots=True, auto_attribs=True) |
@attr.s(slots=True, frozen=True, auto_attribs=True)
class OEmbedResult:
    """The outcome of converting an oEmbed response into Open Graph data."""

    # The Open Graph result (converted from the oEmbed result).
    open_graph_result: JsonDict
    # Number of seconds to cache the content, according to the oEmbed response.
    #
    # This will be None if no cache-age is provided in the oEmbed response (or
    # if the oEmbed response cannot be turned into an Open Graph response).
    cache_age: Optional[int]
39 | 39 |
|
40 | 40 |
|
41 | 41 | class OEmbedProvider:
|
@@ -81,75 +81,106 @@ def get_oembed_url(self, url: str) -> Optional[str]:
|
81 | 81 | """
|
82 | 82 | for url_pattern, endpoint in self._oembed_patterns.items():
|
83 | 83 | if url_pattern.fullmatch(url):
|
84 |
| - return endpoint |
| 84 | + # TODO Specify max height / width. |
| 85 | + |
| 86 | + # Note that only the JSON format is supported, some endpoints want |
| 87 | + # this in the URL, others want it as an argument. |
| 88 | + endpoint = endpoint.replace("{format}", "json") |
| 89 | + |
| 90 | + args = {"url": url, "format": "json"} |
| 91 | + query_str = urllib.parse.urlencode(args, True) |
| 92 | + return f"{endpoint}?{query_str}" |
85 | 93 |
|
86 | 94 | # No match.
|
87 | 95 | return None
|
88 | 96 |
|
89 |
| - async def get_oembed_content(self, endpoint: str, url: str) -> OEmbedResult: |
def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
    """
    Parse the oEmbed response into an Open Graph response.

    Any failure while decoding or interpreting the response is logged and
    swallowed: the returned result then carries an empty Open Graph
    dictionary and a cache age of None.

    Args:
        url: The URL which is being previewed (not the one which was
            requested).
        raw_body: The oEmbed response as JSON encoded as bytes.

    Returns:
        An OEmbedResult holding the Open Graph data derived from the oEmbed
        response and the cache age taken from it (None on failure).
    """

    try:
        # oEmbed responses *must* be UTF-8 according to the spec.
        oembed = json_decoder.decode(raw_body.decode("utf-8"))

        # Ensure there's a version of 1.0.
        oembed_version = oembed["version"]
        if oembed_version != "1.0":
            raise RuntimeError(f"Invalid version: {oembed_version}")

        # Coerce a truthy cache age to an int; a missing or falsy value is
        # passed through unchanged.
        cache_age = oembed.get("cache_age")
        if cache_age:
            cache_age = int(cache_age)

        # The results.
        open_graph_response = {"og:title": oembed.get("title")}

        # If a thumbnail exists, use it. Note that dimensions will be calculated later.
        if "thumbnail_url" in oembed:
            open_graph_response["og:image"] = oembed["thumbnail_url"]

        # Process each type separately.
        oembed_type = oembed["type"]
        if oembed_type == "rich":
            calc_description_and_urls(open_graph_response, oembed["html"])

        elif oembed_type == "photo":
            # If this is a photo, use the full image, not the thumbnail.
            open_graph_response["og:image"] = oembed["url"]

        else:
            raise RuntimeError(f"Unknown oEmbed type: {oembed_type}")

    except Exception as e:
        # Trap any exception and let the code follow as usual.
        #
        # The exception is passed as a "%r" logging argument on purpose: in an
        # f-string, "{e:r}" is a format spec (not the "!r" conversion) and
        # raises TypeError for exception objects, which would escape this
        # handler instead of producing the empty result below.
        logger.warning("Error parsing oEmbed metadata from %s: %r", url, e)
        open_graph_response = {}
        cache_age = None

    return OEmbedResult(open_graph_response, cache_age)
145 | 150 |
|
146 |
| - except OEmbedError as e: |
147 |
| - # Trap OEmbedErrors first so we can directly re-raise them. |
148 |
| - logger.warning("Error parsing oEmbed metadata from %s: %r", url, e) |
149 |
| - raise |
150 | 151 |
|
151 |
| - except Exception as e: |
152 |
| - # Trap any exception and let the code follow as usual. |
153 |
| - # FIXME: pass through 404s and other error messages nicely |
154 |
| - logger.warning("Error downloading oEmbed metadata from %s: %r", url, e) |
155 |
| - raise OEmbedError() from e |
def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) -> None:
    """
    Add a description derived from an HTML snippet to an Open Graph summary.

    The HTML is parsed with lxml and the resulting tree is handed to
    ``_calc_description``. When that yields a non-empty value it is stored
    under the ``og:description`` key.

    Args:
        open_graph_response: The current Open Graph summary. This is updated
            in place with additional fields.
        html_body: The HTML document.

    Returns:
        None; open_graph_response is modified in place.
    """
    # Without a body there is nothing to summarise.
    if not html_body:
        return

    # Imported lazily, only once there is a body to parse.
    from lxml import etree

    # Build the HTML parser (recover=True, UTF-8) and parse the body with it.
    root = etree.fromstring(
        html_body, etree.HTMLParser(recover=True, encoding="utf-8")
    )

    # Parsing can complete without producing a tree.
    if root is None:
        return

    from synapse.rest.media.v1.preview_url_resource import _calc_description

    summary = _calc_description(root)
    if summary:
        open_graph_response["og:description"] = summary
0 commit comments