Skip to content

Commit fc897fc

Browse files
gh-76960: Fix urljoin() and urldefrag() for URIs with empty components (GH-123273)
* urljoin() with relative reference "?" sets empty query and removes fragment. * Preserve empty components (authority, params, query, fragment) in urljoin(). * Preserve empty components (authority, params, query) in urldefrag(). Also refactor the code and get rid of double _coerce_args() and _coerce_result() calls in urljoin(), urldefrag(), urlparse() and urlunparse().
1 parent e5a567b commit fc897fc

File tree

3 files changed

+140
-52
lines changed

3 files changed

+140
-52
lines changed

Lib/test/test_urlparse.py

Lines changed: 73 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -349,18 +349,19 @@ def _encode(t):
349349
split = (scheme,) + split
350350
self.checkRoundtrips(url, parsed, split)
351351

352-
def checkJoin(self, base, relurl, expected):
352+
def checkJoin(self, base, relurl, expected, *, relroundtrip=True):
353353
with self.subTest(base=base, relurl=relurl):
354354
self.assertEqual(urllib.parse.urljoin(base, relurl), expected)
355355
baseb = base.encode('ascii')
356356
relurlb = relurl.encode('ascii')
357357
expectedb = expected.encode('ascii')
358358
self.assertEqual(urllib.parse.urljoin(baseb, relurlb), expectedb)
359359

360-
relurl = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurl))
361-
self.assertEqual(urllib.parse.urljoin(base, relurl), expected)
362-
relurlb = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb))
363-
self.assertEqual(urllib.parse.urljoin(baseb, relurlb), expectedb)
360+
if relroundtrip:
361+
relurl = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurl))
362+
self.assertEqual(urllib.parse.urljoin(base, relurl), expected)
363+
relurlb = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb))
364+
self.assertEqual(urllib.parse.urljoin(baseb, relurlb), expectedb)
364365

365366
def test_unparse_parse(self):
366367
str_cases = ['Python', './Python','x-newscheme://foo.com/stuff','x://y','x:/y','x:/','/',]
@@ -526,8 +527,6 @@ def test_RFC3986(self):
526527

527528
def test_urljoins(self):
528529
self.checkJoin(SIMPLE_BASE, 'g:h','g:h')
529-
self.checkJoin(SIMPLE_BASE, 'http:g','http://a/b/c/g')
530-
self.checkJoin(SIMPLE_BASE, 'http:','http://a/b/c/d')
531530
self.checkJoin(SIMPLE_BASE, 'g','http://a/b/c/g')
532531
self.checkJoin(SIMPLE_BASE, './g','http://a/b/c/g')
533532
self.checkJoin(SIMPLE_BASE, 'g/','http://a/b/c/g/')
@@ -548,8 +547,6 @@ def test_urljoins(self):
548547
self.checkJoin(SIMPLE_BASE, 'g/./h','http://a/b/c/g/h')
549548
self.checkJoin(SIMPLE_BASE, 'g/../h','http://a/b/c/h')
550549
self.checkJoin(SIMPLE_BASE, 'http:g','http://a/b/c/g')
551-
self.checkJoin(SIMPLE_BASE, 'http:','http://a/b/c/d')
552-
self.checkJoin(SIMPLE_BASE, 'http:?y','http://a/b/c/d?y')
553550
self.checkJoin(SIMPLE_BASE, 'http:g?y','http://a/b/c/g?y')
554551
self.checkJoin(SIMPLE_BASE, 'http:g?y/./x','http://a/b/c/g?y/./x')
555552
self.checkJoin('http:///', '..','http:///')
@@ -579,6 +576,53 @@ def test_urljoins(self):
579576
# issue 23703: don't duplicate filename
580577
self.checkJoin('a', 'b', 'b')
581578

579+
# Test with empty (but defined) components.
580+
self.checkJoin(RFC1808_BASE, '', 'http://a/b/c/d;p?q#f')
581+
self.checkJoin(RFC1808_BASE, '#', 'http://a/b/c/d;p?q#', relroundtrip=False)
582+
self.checkJoin(RFC1808_BASE, '#z', 'http://a/b/c/d;p?q#z')
583+
self.checkJoin(RFC1808_BASE, '?', 'http://a/b/c/d;p?', relroundtrip=False)
584+
self.checkJoin(RFC1808_BASE, '?#z', 'http://a/b/c/d;p?#z', relroundtrip=False)
585+
self.checkJoin(RFC1808_BASE, '?y', 'http://a/b/c/d;p?y')
586+
self.checkJoin(RFC1808_BASE, ';', 'http://a/b/c/;')
587+
self.checkJoin(RFC1808_BASE, ';?y', 'http://a/b/c/;?y')
588+
self.checkJoin(RFC1808_BASE, ';#z', 'http://a/b/c/;#z')
589+
self.checkJoin(RFC1808_BASE, ';x', 'http://a/b/c/;x')
590+
self.checkJoin(RFC1808_BASE, '/w', 'http://a/w')
591+
self.checkJoin(RFC1808_BASE, '//', 'http://a/b/c/d;p?q#f')
592+
self.checkJoin(RFC1808_BASE, '//#z', 'http://a/b/c/d;p?q#z')
593+
self.checkJoin(RFC1808_BASE, '//?y', 'http://a/b/c/d;p?y')
594+
self.checkJoin(RFC1808_BASE, '//;x', 'http://;x')
595+
self.checkJoin(RFC1808_BASE, '///w', 'http://a/w')
596+
self.checkJoin(RFC1808_BASE, '//v', 'http://v')
597+
# For backward compatibility with RFC1630, the scheme name is allowed
598+
# to be present in a relative reference if it is the same as the base
599+
# URI scheme.
600+
self.checkJoin(RFC1808_BASE, 'http:', 'http://a/b/c/d;p?q#f')
601+
self.checkJoin(RFC1808_BASE, 'http:#', 'http://a/b/c/d;p?q#', relroundtrip=False)
602+
self.checkJoin(RFC1808_BASE, 'http:#z', 'http://a/b/c/d;p?q#z')
603+
self.checkJoin(RFC1808_BASE, 'http:?', 'http://a/b/c/d;p?', relroundtrip=False)
604+
self.checkJoin(RFC1808_BASE, 'http:?#z', 'http://a/b/c/d;p?#z', relroundtrip=False)
605+
self.checkJoin(RFC1808_BASE, 'http:?y', 'http://a/b/c/d;p?y')
606+
self.checkJoin(RFC1808_BASE, 'http:;', 'http://a/b/c/;')
607+
self.checkJoin(RFC1808_BASE, 'http:;?y', 'http://a/b/c/;?y')
608+
self.checkJoin(RFC1808_BASE, 'http:;#z', 'http://a/b/c/;#z')
609+
self.checkJoin(RFC1808_BASE, 'http:;x', 'http://a/b/c/;x')
610+
self.checkJoin(RFC1808_BASE, 'http:/w', 'http://a/w')
611+
self.checkJoin(RFC1808_BASE, 'http://', 'http://a/b/c/d;p?q#f')
612+
self.checkJoin(RFC1808_BASE, 'http://#z', 'http://a/b/c/d;p?q#z')
613+
self.checkJoin(RFC1808_BASE, 'http://?y', 'http://a/b/c/d;p?y')
614+
self.checkJoin(RFC1808_BASE, 'http://;x', 'http://;x')
615+
self.checkJoin(RFC1808_BASE, 'http:///w', 'http://a/w')
616+
self.checkJoin(RFC1808_BASE, 'http://v', 'http://v')
617+
# Different scheme is not ignored.
618+
self.checkJoin(RFC1808_BASE, 'https:', 'https:', relroundtrip=False)
619+
self.checkJoin(RFC1808_BASE, 'https:#', 'https:#', relroundtrip=False)
620+
self.checkJoin(RFC1808_BASE, 'https:#z', 'https:#z', relroundtrip=False)
621+
self.checkJoin(RFC1808_BASE, 'https:?', 'https:?', relroundtrip=False)
622+
self.checkJoin(RFC1808_BASE, 'https:?y', 'https:?y', relroundtrip=False)
623+
self.checkJoin(RFC1808_BASE, 'https:;', 'https:;')
624+
self.checkJoin(RFC1808_BASE, 'https:;x', 'https:;x')
625+
582626
def test_RFC2732(self):
583627
str_cases = [
584628
('http://Test.python.org:5432/foo/', 'test.python.org', 5432),
@@ -641,16 +685,31 @@ def test_urldefrag(self):
641685
('http://python.org/p?q', 'http://python.org/p?q', ''),
642686
(RFC1808_BASE, 'http://a/b/c/d;p?q', 'f'),
643687
(RFC2396_BASE, 'http://a/b/c/d;p?q', ''),
688+
('http://a/b/c;p?q#f', 'http://a/b/c;p?q', 'f'),
689+
('http://a/b/c;p?q#', 'http://a/b/c;p?q', ''),
690+
('http://a/b/c;p?q', 'http://a/b/c;p?q', ''),
691+
('http://a/b/c;p?#f', 'http://a/b/c;p?', 'f'),
692+
('http://a/b/c;p#f', 'http://a/b/c;p', 'f'),
693+
('http://a/b/c;?q#f', 'http://a/b/c;?q', 'f'),
694+
('http://a/b/c?q#f', 'http://a/b/c?q', 'f'),
695+
('http:///b/c;p?q#f', 'http:///b/c;p?q', 'f'),
696+
('http:b/c;p?q#f', 'http:b/c;p?q', 'f'),
697+
('http:;?q#f', 'http:;?q', 'f'),
698+
('http:?q#f', 'http:?q', 'f'),
699+
('//a/b/c;p?q#f', '//a/b/c;p?q', 'f'),
700+
('://a/b/c;p?q#f', '://a/b/c;p?q', 'f'),
644701
]
645702
def _encode(t):
646703
return type(t)(x.encode('ascii') for x in t)
647704
bytes_cases = [_encode(x) for x in str_cases]
648705
for url, defrag, frag in str_cases + bytes_cases:
649-
result = urllib.parse.urldefrag(url)
650-
self.assertEqual(result.geturl(), url)
651-
self.assertEqual(result, (defrag, frag))
652-
self.assertEqual(result.url, defrag)
653-
self.assertEqual(result.fragment, frag)
706+
with self.subTest(url):
707+
result = urllib.parse.urldefrag(url)
708+
hash = '#' if isinstance(url, str) else b'#'
709+
self.assertEqual(result.geturl(), url.rstrip(hash))
710+
self.assertEqual(result, (defrag, frag))
711+
self.assertEqual(result.url, defrag)
712+
self.assertEqual(result.fragment, frag)
654713

655714
def test_urlsplit_scoped_IPv6(self):
656715
p = urllib.parse.urlsplit('http://[FE80::822a:a8ff:fe49:470c%tESt]:1234')

Lib/urllib/parse.py

Lines changed: 62 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -392,20 +392,23 @@ def urlparse(url, scheme='', allow_fragments=True):
392392
Note that % escapes are not expanded.
393393
"""
394394
url, scheme, _coerce_result = _coerce_args(url, scheme)
395-
splitresult = urlsplit(url, scheme, allow_fragments)
396-
scheme, netloc, url, query, fragment = splitresult
397-
if scheme in uses_params and ';' in url:
398-
url, params = _splitparams(url)
399-
else:
400-
params = ''
401-
result = ParseResult(scheme, netloc, url, params, query, fragment)
395+
scheme, netloc, url, params, query, fragment = _urlparse(url, scheme, allow_fragments)
396+
result = ParseResult(scheme or '', netloc or '', url, params or '', query or '', fragment or '')
402397
return _coerce_result(result)
403398

404-
def _splitparams(url):
399+
def _urlparse(url, scheme=None, allow_fragments=True):
400+
scheme, netloc, url, query, fragment = _urlsplit(url, scheme, allow_fragments)
401+
if (scheme or '') in uses_params and ';' in url:
402+
url, params = _splitparams(url, allow_none=True)
403+
else:
404+
params = None
405+
return (scheme, netloc, url, params, query, fragment)
406+
407+
def _splitparams(url, allow_none=False):
405408
if '/' in url:
406409
i = url.find(';', url.rfind('/'))
407410
if i < 0:
408-
return url, ''
411+
return url, None if allow_none else ''
409412
else:
410413
i = url.find(';')
411414
return url[:i], url[i+1:]
@@ -472,17 +475,23 @@ def urlsplit(url, scheme='', allow_fragments=True):
472475
"""
473476

474477
url, scheme, _coerce_result = _coerce_args(url, scheme)
478+
scheme, netloc, url, query, fragment = _urlsplit(url, scheme, allow_fragments)
479+
v = SplitResult(scheme or '', netloc or '', url, query or '', fragment or '')
480+
return _coerce_result(v)
481+
482+
def _urlsplit(url, scheme=None, allow_fragments=True):
475483
# Only lstrip url as some applications rely on preserving trailing space.
476484
# (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both)
477485
url = url.lstrip(_WHATWG_C0_CONTROL_OR_SPACE)
478-
scheme = scheme.strip(_WHATWG_C0_CONTROL_OR_SPACE)
479-
480486
for b in _UNSAFE_URL_BYTES_TO_REMOVE:
481487
url = url.replace(b, "")
482-
scheme = scheme.replace(b, "")
488+
if scheme is not None:
489+
scheme = scheme.strip(_WHATWG_C0_CONTROL_OR_SPACE)
490+
for b in _UNSAFE_URL_BYTES_TO_REMOVE:
491+
scheme = scheme.replace(b, "")
483492

484493
allow_fragments = bool(allow_fragments)
485-
netloc = query = fragment = ''
494+
netloc = query = fragment = None
486495
i = url.find(':')
487496
if i > 0 and url[0].isascii() and url[0].isalpha():
488497
for c in url[:i]:
@@ -503,8 +512,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
503512
if '?' in url:
504513
url, query = url.split('?', 1)
505514
_checknetloc(netloc)
506-
v = SplitResult(scheme, netloc, url, query, fragment)
507-
return _coerce_result(v)
515+
return (scheme, netloc, url, query, fragment)
508516

509517
def urlunparse(components):
510518
"""Put a parsed URL back together again. This may result in a
@@ -513,9 +521,15 @@ def urlunparse(components):
513521
(the draft states that these are equivalent)."""
514522
scheme, netloc, url, params, query, fragment, _coerce_result = (
515523
_coerce_args(*components))
524+
if not netloc:
525+
if scheme and scheme in uses_netloc and (not url or url[:1] == '/'):
526+
netloc = ''
527+
else:
528+
netloc = None
516529
if params:
517530
url = "%s;%s" % (url, params)
518-
return _coerce_result(urlunsplit((scheme, netloc, url, query, fragment)))
531+
return _coerce_result(_urlunsplit(scheme or None, netloc, url,
532+
query or None, fragment or None))
519533

520534
def urlunsplit(components):
521535
"""Combine the elements of a tuple as returned by urlsplit() into a
@@ -525,20 +539,27 @@ def urlunsplit(components):
525539
empty query; the RFC states that these are equivalent)."""
526540
scheme, netloc, url, query, fragment, _coerce_result = (
527541
_coerce_args(*components))
528-
if netloc:
542+
if not netloc:
543+
if scheme and scheme in uses_netloc and (not url or url[:1] == '/'):
544+
netloc = ''
545+
else:
546+
netloc = None
547+
return _coerce_result(_urlunsplit(scheme or None, netloc, url,
548+
query or None, fragment or None))
549+
550+
def _urlunsplit(scheme, netloc, url, query, fragment):
551+
if netloc is not None:
529552
if url and url[:1] != '/': url = '/' + url
530553
url = '//' + netloc + url
531554
elif url[:2] == '//':
532555
url = '//' + url
533-
elif scheme and scheme in uses_netloc and (not url or url[:1] == '/'):
534-
url = '//' + url
535556
if scheme:
536557
url = scheme + ':' + url
537-
if query:
558+
if query is not None:
538559
url = url + '?' + query
539-
if fragment:
560+
if fragment is not None:
540561
url = url + '#' + fragment
541-
return _coerce_result(url)
562+
return url
542563

543564
def urljoin(base, url, allow_fragments=True):
544565
"""Join a base URL and a possibly relative URL to form an absolute
@@ -549,26 +570,29 @@ def urljoin(base, url, allow_fragments=True):
549570
return base
550571

551572
base, url, _coerce_result = _coerce_args(base, url)
552-
bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
553-
urlparse(base, '', allow_fragments)
554-
scheme, netloc, path, params, query, fragment = \
555-
urlparse(url, bscheme, allow_fragments)
573+
bscheme, bnetloc, bpath, bquery, bfragment = \
574+
_urlsplit(base, None, allow_fragments)
575+
scheme, netloc, path, query, fragment = \
576+
_urlsplit(url, None, allow_fragments)
556577

578+
if scheme is None:
579+
scheme = bscheme
557580
if scheme != bscheme or scheme not in uses_relative:
558581
return _coerce_result(url)
559582
if scheme in uses_netloc:
560583
if netloc:
561-
return _coerce_result(urlunparse((scheme, netloc, path,
562-
params, query, fragment)))
584+
return _coerce_result(_urlunsplit(scheme, netloc, path,
585+
query, fragment))
563586
netloc = bnetloc
564587

565-
if not path and not params:
588+
if not path:
566589
path = bpath
567-
params = bparams
568-
if not query:
590+
if query is None:
569591
query = bquery
570-
return _coerce_result(urlunparse((scheme, netloc, path,
571-
params, query, fragment)))
592+
if fragment is None:
593+
fragment = bfragment
594+
return _coerce_result(_urlunsplit(scheme, netloc, path,
595+
query, fragment))
572596

573597
base_parts = bpath.split('/')
574598
if base_parts[-1] != '':
@@ -605,8 +629,8 @@ def urljoin(base, url, allow_fragments=True):
605629
# then we need to append the trailing '/'
606630
resolved_path.append('')
607631

608-
return _coerce_result(urlunparse((scheme, netloc, '/'.join(
609-
resolved_path) or '/', params, query, fragment)))
632+
return _coerce_result(_urlunsplit(scheme, netloc, '/'.join(
633+
resolved_path) or '/', query, fragment))
610634

611635

612636
def urldefrag(url):
@@ -618,12 +642,12 @@ def urldefrag(url):
618642
"""
619643
url, _coerce_result = _coerce_args(url)
620644
if '#' in url:
621-
s, n, p, a, q, frag = urlparse(url)
622-
defrag = urlunparse((s, n, p, a, q, ''))
645+
s, n, p, q, frag = _urlsplit(url)
646+
defrag = _urlunsplit(s, n, p, q, None)
623647
else:
624648
frag = ''
625649
defrag = url
626-
return _coerce_result(DefragResult(defrag, frag))
650+
return _coerce_result(DefragResult(defrag, frag or ''))
627651

628652
_hexdig = '0123456789ABCDEFabcdef'
629653
_hextobyte = None
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Fix :func:`urllib.parse.urljoin` and :func:`urllib.parse.urldefrag` for URIs
2+
containing empty components. For example, :func:`!urljoin()` with relative
3+
reference "?" now sets empty query and removes fragment.
4+
Preserve empty components (authority, params, query, fragment) in :func:`!urljoin`.
5+
Preserve empty components (authority, params, query) in :func:`!urldefrag`.

0 commit comments

Comments
 (0)