Skip to content

Commit beb80d1

Browse files
authored
GH-113528: Deoptimise pathlib._abc.PurePathBase (#113559)
Apply pathlib's normalization and performance tuning in `pathlib.PurePath`, but not in `pathlib._abc.PurePathBase`. With this change, the pathlib ABCs do not normalize away alternate path separators, empty segments, or dot segments. A single string given to the initialiser will round-trip by default, i.e. `str(PurePathBase(my_string)) == my_string`. Implementors can apply their own domain-specific path normalization scheme by overriding `__str__()`. Eliminating path normalization makes maintaining and caching the path's parts and string representation both optional and not very useful, so this commit moves the `_drv`, `_root`, `_tail_cached` and `_str` slots from `PurePathBase` to `PurePath`. Only the `_raw_paths` and `_resolving` slots remain in `PurePathBase`. This frees the ABCs from the burden of some of pathlib's hardest-to-understand code.
1 parent 57bdc6c commit beb80d1

File tree

4 files changed

+195
-140
lines changed

4 files changed

+195
-140
lines changed

Lib/pathlib/__init__.py

Lines changed: 103 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -76,6 +76,20 @@ class PurePath(_abc.PurePathBase):
7676
"""
7777

7878
__slots__ = (
79+
# The `_drv`, `_root` and `_tail_cached` slots store parsed and
80+
# normalized parts of the path. They are set when any of the `drive`,
81+
# `root` or `_tail` properties are accessed for the first time. The
82+
# three-part division corresponds to the result of
83+
# `os.path.splitroot()`, except that the tail is further split on path
84+
# separators (i.e. it is a list of strings), and that the root and
85+
# tail are normalized.
86+
'_drv', '_root', '_tail_cached',
87+
88+
# The `_str` slot stores the string representation of the path,
89+
# computed from the drive, root and tail when `__str__()` is called
90+
# for the first time. It's used to implement `_str_normcase`
91+
'_str',
92+
7993
# The `_str_normcase_cached` slot stores the string path with
8094
# normalized case. It is set when the `_str_normcase` property is
8195
# accessed for the first time. It's used to implement `__eq__()`
@@ -196,6 +210,94 @@ def __ge__(self, other):
196210
return NotImplemented
197211
return self._parts_normcase >= other._parts_normcase
198212

213+
def __str__(self):
214+
"""Return the string representation of the path, suitable for
215+
passing to system calls."""
216+
try:
217+
return self._str
218+
except AttributeError:
219+
self._str = self._format_parsed_parts(self.drive, self.root,
220+
self._tail) or '.'
221+
return self._str
222+
223+
@classmethod
224+
def _format_parsed_parts(cls, drv, root, tail):
225+
if drv or root:
226+
return drv + root + cls.pathmod.sep.join(tail)
227+
elif tail and cls.pathmod.splitdrive(tail[0])[0]:
228+
tail = ['.'] + tail
229+
return cls.pathmod.sep.join(tail)
230+
231+
def _from_parsed_parts(self, drv, root, tail):
232+
path_str = self._format_parsed_parts(drv, root, tail)
233+
path = self.with_segments(path_str)
234+
path._str = path_str or '.'
235+
path._drv = drv
236+
path._root = root
237+
path._tail_cached = tail
238+
return path
239+
240+
@classmethod
241+
def _parse_path(cls, path):
242+
if not path:
243+
return '', '', []
244+
sep = cls.pathmod.sep
245+
altsep = cls.pathmod.altsep
246+
if altsep:
247+
path = path.replace(altsep, sep)
248+
drv, root, rel = cls.pathmod.splitroot(path)
249+
if not root and drv.startswith(sep) and not drv.endswith(sep):
250+
drv_parts = drv.split(sep)
251+
if len(drv_parts) == 4 and drv_parts[2] not in '?.':
252+
# e.g. //server/share
253+
root = sep
254+
elif len(drv_parts) == 6:
255+
# e.g. //?/unc/server/share
256+
root = sep
257+
parsed = [sys.intern(str(x)) for x in rel.split(sep) if x and x != '.']
258+
return drv, root, parsed
259+
260+
def _load_parts(self):
261+
paths = self._raw_paths
262+
if len(paths) == 0:
263+
path = ''
264+
elif len(paths) == 1:
265+
path = paths[0]
266+
else:
267+
path = self.pathmod.join(*paths)
268+
self._drv, self._root, self._tail_cached = self._parse_path(path)
269+
270+
@property
271+
def drive(self):
272+
"""The drive prefix (letter or UNC path), if any."""
273+
try:
274+
return self._drv
275+
except AttributeError:
276+
self._load_parts()
277+
return self._drv
278+
279+
@property
280+
def root(self):
281+
"""The root of the path, if any."""
282+
try:
283+
return self._root
284+
except AttributeError:
285+
self._load_parts()
286+
return self._root
287+
288+
@property
289+
def _tail(self):
290+
try:
291+
return self._tail_cached
292+
except AttributeError:
293+
self._load_parts()
294+
return self._tail_cached
295+
296+
@property
297+
def anchor(self):
298+
"""The concatenation of the drive and root, or ''."""
299+
return self.drive + self.root
300+
199301
@property
200302
def parts(self):
201303
"""An object providing sequence-like access to the
@@ -416,7 +518,7 @@ def iterdir(self):
416518
def _scandir(self):
417519
return os.scandir(self)
418520

419-
def _make_child_entry(self, entry):
521+
def _make_child_entry(self, entry, is_dir=False):
420522
# Transform an entry yielded from _scandir() into a path object.
421523
path_str = entry.name if str(self) == '.' else entry.path
422524
path = self.with_segments(path_str)

Lib/pathlib/_abc.py

Lines changed: 29 additions & 113 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,6 @@
11
import functools
22
import ntpath
33
import posixpath
4-
import sys
54
from errno import ENOENT, ENOTDIR, EBADF, ELOOP, EINVAL
65
from stat import S_ISDIR, S_ISLNK, S_ISREG, S_ISSOCK, S_ISBLK, S_ISCHR, S_ISFIFO
76

@@ -82,7 +81,7 @@ def _select_children(parent_paths, dir_only, follow_symlinks, match):
8281
except OSError:
8382
continue
8483
if match(entry.name):
85-
yield parent_path._make_child_entry(entry)
84+
yield parent_path._make_child_entry(entry, dir_only)
8685

8786

8887
def _select_recursive(parent_paths, dir_only, follow_symlinks):
@@ -105,7 +104,7 @@ def _select_recursive(parent_paths, dir_only, follow_symlinks):
105104
for entry in entries:
106105
try:
107106
if entry.is_dir(follow_symlinks=follow_symlinks):
108-
paths.append(path._make_child_entry(entry))
107+
paths.append(path._make_child_entry(entry, dir_only))
109108
continue
110109
except OSError:
111110
pass
@@ -147,20 +146,6 @@ class PurePathBase:
147146
# in the `__init__()` method.
148147
'_raw_paths',
149148

150-
# The `_drv`, `_root` and `_tail_cached` slots store parsed and
151-
# normalized parts of the path. They are set when any of the `drive`,
152-
# `root` or `_tail` properties are accessed for the first time. The
153-
# three-part division corresponds to the result of
154-
# `os.path.splitroot()`, except that the tail is further split on path
155-
# separators (i.e. it is a list of strings), and that the root and
156-
# tail are normalized.
157-
'_drv', '_root', '_tail_cached',
158-
159-
# The `_str` slot stores the string representation of the path,
160-
# computed from the drive, root and tail when `__str__()` is called
161-
# for the first time. It's used to implement `_str_normcase`
162-
'_str',
163-
164149
# The '_resolving' slot stores a boolean indicating whether the path
165150
# is being processed by `PathBase.resolve()`. This prevents duplicate
166151
# work from occurring when `resolve()` calls `stat()` or `readlink()`.
@@ -179,65 +164,16 @@ def with_segments(self, *pathsegments):
179164
"""
180165
return type(self)(*pathsegments)
181166

182-
@classmethod
183-
def _parse_path(cls, path):
184-
if not path:
185-
return '', '', []
186-
sep = cls.pathmod.sep
187-
altsep = cls.pathmod.altsep
188-
if altsep:
189-
path = path.replace(altsep, sep)
190-
drv, root, rel = cls.pathmod.splitroot(path)
191-
if not root and drv.startswith(sep) and not drv.endswith(sep):
192-
drv_parts = drv.split(sep)
193-
if len(drv_parts) == 4 and drv_parts[2] not in '?.':
194-
# e.g. //server/share
195-
root = sep
196-
elif len(drv_parts) == 6:
197-
# e.g. //?/unc/server/share
198-
root = sep
199-
parsed = [sys.intern(str(x)) for x in rel.split(sep) if x and x != '.']
200-
return drv, root, parsed
201-
202-
def _load_parts(self):
203-
paths = self._raw_paths
204-
if len(paths) == 0:
205-
path = ''
206-
elif len(paths) == 1:
207-
path = paths[0]
208-
else:
209-
path = self.pathmod.join(*paths)
210-
drv, root, tail = self._parse_path(path)
211-
self._drv = drv
212-
self._root = root
213-
self._tail_cached = tail
214-
215-
def _from_parsed_parts(self, drv, root, tail):
216-
path_str = self._format_parsed_parts(drv, root, tail)
217-
path = self.with_segments(path_str)
218-
path._str = path_str or '.'
219-
path._drv = drv
220-
path._root = root
221-
path._tail_cached = tail
222-
return path
223-
224-
@classmethod
225-
def _format_parsed_parts(cls, drv, root, tail):
226-
if drv or root:
227-
return drv + root + cls.pathmod.sep.join(tail)
228-
elif tail and cls.pathmod.splitdrive(tail[0])[0]:
229-
tail = ['.'] + tail
230-
return cls.pathmod.sep.join(tail)
231-
232167
def __str__(self):
233168
"""Return the string representation of the path, suitable for
234169
passing to system calls."""
235-
try:
236-
return self._str
237-
except AttributeError:
238-
self._str = self._format_parsed_parts(self.drive, self.root,
239-
self._tail) or '.'
240-
return self._str
170+
paths = self._raw_paths
171+
if len(paths) == 1:
172+
return paths[0]
173+
elif paths:
174+
return self.pathmod.join(*paths)
175+
else:
176+
return ''
241177

242178
def as_posix(self):
243179
"""Return the string representation of the path with forward (/)
@@ -247,42 +183,23 @@ def as_posix(self):
247183
@property
248184
def drive(self):
249185
"""The drive prefix (letter or UNC path), if any."""
250-
try:
251-
return self._drv
252-
except AttributeError:
253-
self._load_parts()
254-
return self._drv
186+
return self.pathmod.splitdrive(str(self))[0]
255187

256188
@property
257189
def root(self):
258190
"""The root of the path, if any."""
259-
try:
260-
return self._root
261-
except AttributeError:
262-
self._load_parts()
263-
return self._root
264-
265-
@property
266-
def _tail(self):
267-
try:
268-
return self._tail_cached
269-
except AttributeError:
270-
self._load_parts()
271-
return self._tail_cached
191+
return self.pathmod.splitroot(str(self))[1]
272192

273193
@property
274194
def anchor(self):
275195
"""The concatenation of the drive and root, or ''."""
276-
anchor = self.drive + self.root
277-
return anchor
196+
drive, root, _ = self.pathmod.splitroot(str(self))
197+
return drive + root
278198

279199
@property
280200
def name(self):
281201
"""The final path component, if any."""
282-
path_str = str(self)
283-
if not path_str or path_str == '.':
284-
return ''
285-
return self.pathmod.basename(path_str)
202+
return self.pathmod.basename(str(self))
286203

287204
@property
288205
def suffix(self):
@@ -323,13 +240,10 @@ def stem(self):
323240

324241
def with_name(self, name):
325242
"""Return a new path with the file name changed."""
326-
m = self.pathmod
327-
if not name or m.sep in name or (m.altsep and m.altsep in name) or name == '.':
243+
dirname = self.pathmod.dirname
244+
if dirname(name):
328245
raise ValueError(f"Invalid name {name!r}")
329-
parent, old_name = m.split(str(self))
330-
if not old_name or old_name == '.':
331-
raise ValueError(f"{self!r} has an empty name")
332-
return self.with_segments(parent, name)
246+
return self.with_segments(dirname(str(self)), name)
333247

334248
def with_stem(self, stem):
335249
"""Return a new path with the stem changed."""
@@ -480,7 +394,7 @@ def is_absolute(self):
480394
def is_reserved(self):
481395
"""Return True if the path contains one of the special names reserved
482396
by the system, if any."""
483-
if self.pathmod is posixpath or not self._tail:
397+
if self.pathmod is posixpath or not self.name:
484398
return False
485399

486400
# NOTE: the rules for reserved names seem somewhat complicated
@@ -490,7 +404,7 @@ def is_reserved(self):
490404
if self.drive.startswith('\\\\'):
491405
# UNC paths are never reserved.
492406
return False
493-
name = self._tail[-1].partition('.')[0].partition(':')[0].rstrip(' ')
407+
name = self.name.partition('.')[0].partition(':')[0].rstrip(' ')
494408
return name.upper() in _WIN_RESERVED_NAMES
495409

496410
def match(self, path_pattern, *, case_sensitive=None):
@@ -503,9 +417,9 @@ def match(self, path_pattern, *, case_sensitive=None):
503417
case_sensitive = _is_case_sensitive(self.pathmod)
504418
sep = path_pattern.pathmod.sep
505419
pattern_str = str(path_pattern)
506-
if path_pattern.drive or path_pattern.root:
420+
if path_pattern.anchor:
507421
pass
508-
elif path_pattern._tail:
422+
elif path_pattern.parts:
509423
pattern_str = f'**{sep}{pattern_str}'
510424
else:
511425
raise ValueError("empty pattern")
@@ -780,8 +694,10 @@ def _scandir(self):
780694
from contextlib import nullcontext
781695
return nullcontext(self.iterdir())
782696

783-
def _make_child_entry(self, entry):
697+
def _make_child_entry(self, entry, is_dir=False):
784698
# Transform an entry yielded from _scandir() into a path object.
699+
if is_dir:
700+
return entry.joinpath('')
785701
return entry
786702

787703
def _make_child_relpath(self, name):
@@ -792,13 +708,13 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
792708
kind, including directories) matching the given relative pattern.
793709
"""
794710
path_pattern = self.with_segments(pattern)
795-
if path_pattern.drive or path_pattern.root:
711+
if path_pattern.anchor:
796712
raise NotImplementedError("Non-relative patterns are unsupported")
797-
elif not path_pattern._tail:
713+
elif not path_pattern.parts:
798714
raise ValueError("Unacceptable pattern: {!r}".format(pattern))
799715

800-
pattern_parts = path_pattern._tail.copy()
801-
if pattern[-1] in (self.pathmod.sep, self.pathmod.altsep):
716+
pattern_parts = list(path_pattern.parts)
717+
if not self.pathmod.basename(pattern):
802718
# GH-65238: pathlib doesn't preserve trailing slash. Add it back.
803719
pattern_parts.append('')
804720

@@ -816,7 +732,7 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
816732
filter_paths = follow_symlinks is not None and '..' not in pattern_parts
817733
deduplicate_paths = False
818734
sep = self.pathmod.sep
819-
paths = iter([self] if self.is_dir() else [])
735+
paths = iter([self.joinpath('')] if self.is_dir() else [])
820736
part_idx = 0
821737
while part_idx < len(pattern_parts):
822738
part = pattern_parts[part_idx]

0 commit comments

Comments
 (0)