Move URL parsing functions to their own module (#1360)
bdraco authored Oct 21, 2024
1 parent e7bea7e commit 55967c2
Showing 2 changed files with 208 additions and 205 deletions.
188 changes: 188 additions & 0 deletions yarl/_parse.py
@@ -0,0 +1,188 @@
"""URL parsing utilities."""

import re
import unicodedata
from functools import lru_cache
from typing import Union
from urllib.parse import SplitResult, scheme_chars, uses_netloc

from ._quoters import QUOTER

# Leading and trailing C0 control and space to be stripped per WHATWG spec.
# == "".join([chr(i) for i in range(0, 0x20 + 1)])
WHATWG_C0_CONTROL_OR_SPACE = (
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10"
"\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f "
)

# Unsafe bytes to be removed per WHATWG spec
UNSAFE_URL_BYTES_TO_REMOVE = ["\t", "\r", "\n"]
USES_AUTHORITY = frozenset(uses_netloc)


@lru_cache
def split_url(url: str) -> SplitResult:
    """Split URL into parts."""
    # Adapted from urllib.parse.urlsplit
    # Only lstrip url as some applications rely on preserving trailing space.
    # (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both)
    url = url.lstrip(WHATWG_C0_CONTROL_OR_SPACE)
    for b in UNSAFE_URL_BYTES_TO_REMOVE:
        if b in url:
            url = url.replace(b, "")

    scheme = netloc = query = fragment = ""
    i = url.find(":")
    if i > 0 and url[0] in scheme_chars:
        for c in url[1:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i + 1 :]
    has_hash = "#" in url
    has_question_mark = "?" in url
    if url[:2] == "//":
        delim = len(url)  # position of end of domain part of url, default is end
        if has_hash and has_question_mark:
            delim_chars = "/?#"
        elif has_question_mark:
            delim_chars = "/?"
        elif has_hash:
            delim_chars = "/#"
        else:
            delim_chars = "/"
        for c in delim_chars:  # look for delimiters; the order is NOT important
            wdelim = url.find(c, 2)  # find first of this delim
            if wdelim >= 0 and wdelim < delim:  # if found
                delim = wdelim  # use earliest delim position
        netloc = url[2:delim]
        url = url[delim:]
        has_left_bracket = "[" in netloc
        has_right_bracket = "]" in netloc
        if (has_left_bracket and not has_right_bracket) or (
            has_right_bracket and not has_left_bracket
        ):
            raise ValueError("Invalid IPv6 URL")
        if has_left_bracket:
            bracketed_host = netloc.partition("[")[2].partition("]")[0]
            # Valid bracketed hosts are defined in
            # https://www.rfc-editor.org/rfc/rfc3986#page-49
            # https://url.spec.whatwg.org/
            if bracketed_host[0] == "v":
                if not re.match(r"\Av[a-fA-F0-9]+\..+\Z", bracketed_host):
                    raise ValueError("IPvFuture address is invalid")
            elif ":" not in bracketed_host:
                raise ValueError("An IPv4 address cannot be in brackets")
    if has_hash:
        url, _, fragment = url.partition("#")
    if has_question_mark:
        url, _, query = url.partition("?")
    if netloc and not netloc.isascii():
        _check_netloc(netloc)
    return tuple.__new__(SplitResult, (scheme, netloc, url, query, fragment))


def _check_netloc(netloc: str) -> None:
    # Adapted from urllib.parse._checknetloc
    # looking for characters like \u2100 that expand to 'a/c'
    # IDNA uses NFKC equivalence, so normalize for this check

    # ignore characters already included
    # but not the surrounding text
    n = netloc.replace("@", "").replace(":", "").replace("#", "").replace("?", "")
    normalized_netloc = unicodedata.normalize("NFKC", n)
    if n == normalized_netloc:
        return
    # Note that there are no unicode decompositions for the character '@', so
    # it's currently impossible to have test coverage for this branch; however,
    # if one should be added in the future we want to make sure it's still checked.
    for c in "/?#@:":  # pragma: no branch
        if c in normalized_netloc:
            raise ValueError(
                f"netloc '{netloc}' contains invalid "
                "characters under NFKC normalization"
            )


@lru_cache # match the same size as urlsplit
def split_netloc(
    netloc: str,
) -> tuple[Union[str, None], Union[str, None], Union[str, None], Union[int, None]]:
    """Split netloc into username, password, host and port."""
    if "@" not in netloc:
        username: Union[str, None] = None
        password: Union[str, None] = None
        hostinfo = netloc
    else:
        userinfo, _, hostinfo = netloc.rpartition("@")
        username, have_password, password = userinfo.partition(":")
        if not have_password:
            password = None

    if "[" in hostinfo:
        _, _, bracketed = hostinfo.partition("[")
        hostname, _, port_str = bracketed.partition("]")
        _, _, port_str = port_str.partition(":")
    else:
        hostname, _, port_str = hostinfo.partition(":")

    if not port_str:
        return username or None, password, hostname or None, None

    try:
        port = int(port_str)
    except ValueError:
        raise ValueError("Invalid URL: port can't be converted to integer")
    if not (0 <= port <= 65535):
        raise ValueError("Port out of range 0-65535")
    return username or None, password, hostname or None, port


def unsplit_result(
    scheme: str, netloc: str, url: str, query: str, fragment: str
) -> str:
    """Unsplit a URL without any normalization."""
    if netloc or (scheme and scheme in USES_AUTHORITY) or url[:2] == "//":
        if url and url[:1] != "/":
            url = f"//{netloc or ''}/{url}"
        else:
            url = f"//{netloc or ''}{url}"
    if scheme:
        url = f"{scheme}:{url}"
    if query:
        url = f"{url}?{query}"
    return f"{url}#{fragment}" if fragment else url


@lru_cache # match the same size as urlsplit
def make_netloc(
    user: Union[str, None],
    password: Union[str, None],
    host: Union[str, None],
    port: Union[int, None],
    encode: bool = False,
) -> str:
    """Make netloc from parts.

    The user and password are encoded if encode is True.
    The host must already be encoded with _encode_host.
    """
    if host is None:
        return ""
    ret = host
    if port is not None:
        ret = f"{ret}:{port}"
    if user is None and password is None:
        return ret
    if password is not None:
        if not user:
            user = ""
        elif encode:
            user = QUOTER(user)
        if encode:
            password = QUOTER(password)
        user = f"{user}:{password}"
    elif user and encode:
        user = QUOTER(user)
    return f"{user}@{ret}" if user else ret
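
For orientation, here is a minimal usage sketch of the functions collected into the new module. It assumes a source checkout where yarl._parse is importable; the example URL and values are made up for illustration and are not part of this commit.

# Illustrative sketch, not part of the committed file.
from yarl._parse import make_netloc, split_netloc, split_url, unsplit_result

# Split a full URL into SplitResult parts (scheme, netloc, path, query, fragment).
parts = split_url("https://user:pass@example.com:8080/path?q=1#frag")
assert parts.scheme == "https"
assert parts.netloc == "user:pass@example.com:8080"

# Break the netloc down further into username, password, host and port.
user, password, host, port = split_netloc(parts.netloc)
assert (user, password, host, port) == ("user", "pass", "example.com", 8080)

# Rebuild the netloc and then the full URL, without any normalization.
netloc = make_netloc(user, password, host, port)
url = unsplit_result(parts.scheme, netloc, parts.path, parts.query, parts.fragment)
assert url == "https://user:pass@example.com:8080/path?q=1#frag"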