From fd133d4f21cd7f5cbf6bcf332290ce52e5501167 Mon Sep 17 00:00:00 2001 From: Barney Gale Date: Fri, 22 Nov 2024 00:29:05 +0000 Subject: [PATCH] GH-126601: `pathname2url()`: handle NTFS alternate data streams (#126760) Adjust `pathname2url()` to encode embedded colon characters in Windows paths, rather than bailing out with an `OSError`. Co-authored-by: Steve Dower --- Doc/library/urllib.request.rst | 5 +++++ Lib/nturl2path.py | 22 +++++++++---------- Lib/test/test_urllib.py | 5 +++-- ...-11-12-20-05-09.gh-issue-126601.Nj7bA9.rst | 3 +++ 4 files changed, 21 insertions(+), 14 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-11-12-20-05-09.gh-issue-126601.Nj7bA9.rst diff --git a/Doc/library/urllib.request.rst b/Doc/library/urllib.request.rst index cdd58b84a995b7..e0831bf7e65ad2 100644 --- a/Doc/library/urllib.request.rst +++ b/Doc/library/urllib.request.rst @@ -152,6 +152,11 @@ The :mod:`urllib.request` module defines the following functions: the path component of a URL. This does not produce a complete URL. The return value will already be quoted using the :func:`~urllib.parse.quote` function. + .. versionchanged:: 3.14 + On Windows, ``:`` characters not following a drive letter are quoted. In + previous versions, :exc:`OSError` was raised if a colon character was + found in any position other than the second character. + .. function:: url2pathname(path) diff --git a/Lib/nturl2path.py b/Lib/nturl2path.py index 255eb2f547c2ce..ed7880fd1a775f 100644 --- a/Lib/nturl2path.py +++ b/Lib/nturl2path.py @@ -40,6 +40,7 @@ def pathname2url(p): # C:\foo\bar\spam.foo # becomes # ///C:/foo/bar/spam.foo + import ntpath import urllib.parse # First, clean up some special forms. We are going to sacrifice # the additional information anyway @@ -48,16 +49,13 @@ def pathname2url(p): p = p[4:] if p[:4].upper() == 'UNC/': p = '//' + p[4:] - elif p[1:2] != ':': - raise OSError('Bad path: ' + p) - if not ':' in p: - # No DOS drive specified, just quote the pathname - return urllib.parse.quote(p) - comp = p.split(':', maxsplit=2) - if len(comp) != 2 or len(comp[0]) > 1: - error = 'Bad path: ' + p - raise OSError(error) + drive, tail = ntpath.splitdrive(p) + if drive[1:] == ':': + # DOS drive specified. Add three slashes to the start, producing + # an authority section with a zero-length authority, and a path + # section starting with a single slash. + drive = f'///{drive.upper()}' - drive = urllib.parse.quote(comp[0].upper()) - tail = urllib.parse.quote(comp[1]) - return '///' + drive + ':' + tail + drive = urllib.parse.quote(drive, safe='/:') + tail = urllib.parse.quote(tail) + return drive + tail diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index c66b1c49c316e6..3e5dc256d317a7 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -1429,8 +1429,9 @@ def test_pathname2url_win(self): self.assertEqual(fn('C:\\a\\b%#c'), '///C:/a/b%25%23c') self.assertEqual(fn('C:\\a\\b\xe9'), '///C:/a/b%C3%A9') self.assertEqual(fn('C:\\foo\\bar\\spam.foo'), "///C:/foo/bar/spam.foo") - # Long drive letter - self.assertRaises(IOError, fn, "XX:\\") + # NTFS alternate data streams + self.assertEqual(fn('C:\\foo:bar'), '///C:/foo%3Abar') + self.assertEqual(fn('foo:bar'), 'foo%3Abar') # No drive letter self.assertEqual(fn("\\folder\\test\\"), '/folder/test/') self.assertEqual(fn("\\\\folder\\test\\"), '//folder/test/') diff --git a/Misc/NEWS.d/next/Library/2024-11-12-20-05-09.gh-issue-126601.Nj7bA9.rst b/Misc/NEWS.d/next/Library/2024-11-12-20-05-09.gh-issue-126601.Nj7bA9.rst new file mode 100644 index 00000000000000..11e2b7350a0e48 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-11-12-20-05-09.gh-issue-126601.Nj7bA9.rst @@ -0,0 +1,3 @@ +Fix issue where :func:`urllib.request.pathname2url` raised :exc:`OSError` +when given a Windows path containing a colon character not following a +drive letter, such as before an NTFS alternate data stream.