@@ -95,9 +95,29 @@ def safe_url_string( # pylint: disable=too-many-locals
95
95
path_encoding : str = "utf8" ,
96
96
quote_path : bool = True ,
97
97
) -> str :
98
- """Convert the given URL into a legal URL by escaping unsafe characters
99
- according to RFC-3986. Also, ASCII tabs and newlines are removed
100
- as per https://url.spec.whatwg.org/#url-parsing.
98
+ """Return a URL equivalent to *url* that a wide range of web browsers and
99
+ web servers consider valid.
100
+
101
+ *url* is parsed according to the rules of the `URL living standard`_,
102
+ and during serialization additional characters are percent-encoded to make
103
+ the URL valid by additional URL standards.
104
+
105
+ .. _URL living standard: https://url.spec.whatwg.org/
106
+
107
+ The returned URL is valid by *all* of the following URL standards known to
108
+ be enforced by modern-day web browsers and web servers:
109
+
110
+ - `URL living standard`_
111
+
112
+ - `RFC 3986`_
113
+
114
+ - `RFC 2396`_ and `RFC 2732`_, as interpreted by `Java 8’s java.net.URI
115
+ class`_.
116
+
117
+ .. _Java 8’s java.net.URI class: https://docs.oracle.com/javase/8/docs/api/java/net/URI.html
118
+ .. _RFC 2396: https://www.ietf.org/rfc/rfc2396.txt
119
+ .. _RFC 2732: https://www.ietf.org/rfc/rfc2732.txt
120
+ .. _RFC 3986: https://www.ietf.org/rfc/rfc3986.txt
101
121
102
122
If a bytes URL is given, it is first converted to `str` using the given
103
123
encoding (which defaults to 'utf-8'). If quote_path is True (default),
@@ -111,10 +131,8 @@ def safe_url_string( # pylint: disable=too-many-locals
111
131
112
132
Calling this function on an already "safe" URL will return the URL
113
133
unmodified.
114
-
115
- Always returns a native `str` (bytes in Python2, unicode in Python3).
116
134
"""
117
- # Python3's urlsplit() chokes on bytes input with non-ASCII chars,
135
+ # urlsplit() chokes on bytes input with non-ASCII chars,
118
136
# so let's decode (to Unicode) using page encoding:
119
137
# - it is assumed that a raw bytes input comes from a document
120
138
encoded with the supplied encoding (or UTF-8 by default)
@@ -538,11 +556,8 @@ def canonicalize_url(
538
556
) -> str :
539
557
r"""Canonicalize the given url by applying the following procedures:
540
558
559
+ - make the URL safe (see :func:`safe_url_string`)
541
560
- sort query arguments, first by key, then by value
542
- - percent encode paths ; non-ASCII characters are percent-encoded
543
- using UTF-8 (RFC-3986)
544
- - percent encode query arguments ; non-ASCII characters are percent-encoded
545
- using passed `encoding` (UTF-8 by default)
546
561
- normalize all spaces (in query arguments) to '+' (plus symbol)
547
562
- normalize the case of percent-encodings (%2f -> %2F)
548
563
- remove query arguments with blank values (unless `keep_blank_values` is True)
0 commit comments