diff --git a/plugins/Web/plugin.py b/plugins/Web/plugin.py
index c56da8535..c4d1dfbe5 100644
--- a/plugins/Web/plugin.py
+++ b/plugins/Web/plugin.py
@@ -150,22 +150,46 @@ def noIgnore(self, irc, msg):
     def getTitle(self, irc, url, raiseErrors, msg):
         size = conf.supybot.protocols.http.peekSize()
 
-        parsed_url = utils.web.urlparse(url)
-        if parsed_url.netloc in ('youtube.com', 'youtu.be') \
-                or parsed_url.netloc.endswith(('.youtube.com')):
-            # there is a lot of Javascript before the <title>
-            size = max(819200, size)
-        if parsed_url.netloc in ('reddit.com', 'www.reddit.com', 'new.reddit.com'):
-            # Since 2022-03, New Reddit has 'Reddit - Dive into anything' as
-            # <title> on every page.
-            parsed_url = parsed_url._replace(netloc='old.reddit.com')
-            url = utils.web.urlunparse(parsed_url)
-
+        def url_workaround(url):
+            """Returns a new URL that should be the target of a new request,
+            or None if the request is fine as it is.
+
+            The returned URL may be the same as the parameter, in case
+            something else was changed by this function through side-effects.
+            """
+            nonlocal size
+            parsed_url = utils.web.urlparse(url)
+            if parsed_url.netloc in ('youtube.com', 'youtu.be') \
+                    or parsed_url.netloc.endswith(('.youtube.com')):
+                # there is a lot of Javascript before the <title>
+                if size < 819200:
+                    size = 819200
+                    return url
+                else:
+                    return None
+            if parsed_url.netloc in ('reddit.com', 'www.reddit.com', 'new.reddit.com'):
+                # Since 2022-03, New Reddit has 'Reddit - Dive into anything' as
+                # <title> on every page.
+                parsed_url = parsed_url._replace(netloc='old.reddit.com')
+                url = utils.web.urlunparse(parsed_url)
+                self.log.debug("Rewrite URL to %s", url)
+                return url
+
+            return None
+
+        url = url_workaround(url) or url
         timeout = self.registryValue('timeout')
         headers = conf.defaultHttpHeaders(irc.network, msg.channel)
         try:
             fd = utils.web.getUrlFd(url, timeout=timeout, headers=headers)
             target = fd.geturl()
+            fixed_target = url_workaround(target)
+            if fixed_target is not None:
+                # happens when using minification services linking to one of
+                # the websites handled by url_workaround; eg. v.redd.it
+                fd.close()
+                fd = utils.web.getUrlFd(fixed_target, timeout=timeout, headers=headers)
+                target = fd.geturl()
             text = fd.read(size)
             response_headers = fd.headers
             fd.close()
diff --git a/plugins/Web/test.py b/plugins/Web/test.py
index e8ecdff33..b35ceb5e7 100644
--- a/plugins/Web/test.py
+++ b/plugins/Web/test.py
@@ -84,6 +84,9 @@ def testtitleReddit(self):
             self.assertRegexp(
                 'title https://www.reddit.com/r/irc/',
                 'Internet Relay Chat')
+            self.assertRegexp(
+                'title https://v.redd.it/odhemxo6giud1',
+                'Small Kitty Big Goals : MadeMeSmile')
 
         def testTitleMarcinfo(self):
             # Checks that we don't crash on 'Content-Type: text/html;'