Skip to content

[BUG] Google News link schema changed? #645

Open
@moehmeni

Description

@moehmeni

When decoding Google News URLs into the real article URLs, I get the error shown below. Code to reproduce:

import base64
import re

# Some url encoding related constants
_ENCODED_URL_PREFIX = "https://news.google.com/rss/articles/"
_ENCODED_URL_PREFIX_WITH_CONSENT = (
    "https://consent.google.com/m?continue=https://news.google.com/rss/articles/"
)
# One pattern accepts either prefix. Binding two separately compiled patterns
# to the same name would leave only the last one in effect, silently dropping
# support for the consent-redirect form.
_ENCODED_URL_RE = re.compile(
    rf"^(?:{re.escape(_ENCODED_URL_PREFIX_WITH_CONSENT)}"
    rf"|{re.escape(_ENCODED_URL_PREFIX)})"
    r"(?P<encoded_url>[^?]+)"
)
# Layout of the decoded payload this module understands: a fixed 3-byte
# header, at least one further byte, the article URL, then the bytes \xd2\x01.
_DECODED_URL_RE = re.compile(rb'^\x08\x13".+?(?P<primary_url>http[^\xd2]+)\xd2\x01')


class GNewsURLDecodeError(ValueError, AttributeError):
    """Raised when a Google News link cannot be decoded into an article URL.

    Derives from ``ValueError`` because the input is what is wrong, and from
    ``AttributeError`` so that code written against the earlier behaviour
    (an ``AttributeError`` from calling ``.groupdict()`` on a failed match)
    keeps catching the failure.
    """


def prepare_gnews_url(url):
    """Return the article URL embedded in a Google News RSS article link.

    The path segment after ``/rss/articles/`` is URL-safe base64; for the
    payload layout described by ``_DECODED_URL_RE`` the article URL is stored
    inside it and can be recovered without any network request.

    Args:
        url: A link starting with ``_ENCODED_URL_PREFIX`` or
            ``_ENCODED_URL_PREFIX_WITH_CONSENT``. Anything from the first
            ``?`` after the encoded segment onwards is ignored.

    Returns:
        The decoded article URL as ``str``.

    Raises:
        GNewsURLDecodeError: If ``url`` does not start with a supported
            prefix, or if the decoded payload does not contain a URL in the
            expected layout.
        binascii.Error: If the encoded segment is not valid base64 (this is
            itself a ``ValueError`` subclass).
    """
    # There seems to be a case when we get a URL with consent.google.com
    # see https://github.com/ranahaani/GNews/issues/62
    # Also, the URL is directly decoded, no need to go through news.google.com
    url_match = _ENCODED_URL_RE.match(url)
    if url_match is None:
        raise GNewsURLDecodeError(
            f"Not a recognised Google News article URL: {url!r}"
        )

    # Fix incorrect padding. Ref: https://stackoverflow.com/a/49459036/
    # (surplus '=' characters are ignored by the decoder).
    decoded_text = base64.urlsafe_b64decode(url_match.group("encoded_url") + "===")

    payload_match = _DECODED_URL_RE.match(decoded_text)
    if payload_match is None:
        # NOTE(review): some links appear to carry an opaque token in the
        # payload instead of the article URL itself; those cannot be decoded
        # offline by this function -- confirm how they should be resolved.
        raise GNewsURLDecodeError(
            f"Decoded Google News payload does not contain an article URL: {url!r}"
        )

    return payload_match.group("primary_url").decode()


# Reproduction: run the decoder on a link copied from a Google News RSS feed.
url = (
    "https://news.google.com/rss/articles/"
    "CBMi2AFBVV95cUxQOHZlbFBOSXZDQTVDNWhibW9nMlUzaWpfbVRZaTNKMXd4VFNtQ2YxQWt2UmtDbHdia2xvbHZDMU03eXVabzFscDdMcHV4aGFnNW1zdU9zakVyaEFmMm1FVDVBRVotdktTbkJBOUFrT3dwNTY5bVNzZWRJQk1RT3l5SnBBeWdXS1laeVpwejQzN3luZjgwVjN0bFB5NkZSM2oxRXJ6Q0ItbDNMUDZJRTdEZXhjbUV1Z3NYMHdXV1hKV3N3YndWOVZjVE9uZlBGNkk0SS1mbTZ3b0Q"
    "?oc=5"
)
result = prepare_gnews_url(url)
print("Result:", result)

Running this raises:

AttributeError: 'NoneType' object has no attribute 'groupdict'

I think Google changed the link format recently; this code was working before.

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug: Something isn't working

    Projects

    No projects

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions