Skip to content

Commit

Permalink
filesystem.py find: return directories and improve performance (spack…
Browse files Browse the repository at this point in the history
  • Loading branch information
haampie authored Nov 11, 2024
1 parent 30db764 commit a9e6074
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 57 deletions.
84 changes: 44 additions & 40 deletions lib/spack/llnl/util/filesystem.py
Original file line number Diff line number Diff line change
Expand Up @@ -1693,11 +1693,11 @@ def find(
recursive: bool = True,
max_depth: Optional[int] = None,
) -> List[str]:
"""Finds all non-directory files matching the patterns from ``files`` starting from ``root``.
This function returns a deterministic result for the same input and directory structure when
run multiple times. Symlinked directories are followed, and unique directories are searched
only once. Each matching file is returned only once at lowest depth in case multiple paths
exist due to symlinked directories.
"""Finds all files matching the patterns from ``files`` starting from ``root``. This function
returns a deterministic result for the same input and directory structure when run multiple
times. Symlinked directories are followed, and unique directories are searched only once. Each
matching file is returned only once at lowest depth in case multiple paths exist due to
symlinked directories.
Accepts any glob characters accepted by fnmatch:
Expand Down Expand Up @@ -1830,54 +1830,58 @@ def _find_max_depth(
# Use glob.glob for complex patterns.
for pattern_name, pattern in complex_patterns.items():
matched_paths[pattern_name].extend(
path
for path in glob.glob(os.path.join(curr_dir, pattern))
if not os.path.isdir(path)
path for path in glob.glob(os.path.join(curr_dir, pattern))
)

# List of subdirectories by path and (inode, device) tuple
subdirs: List[Tuple[str, Tuple[int, int]]] = []

with dir_iter:
ordered_entries = sorted(dir_iter, key=lambda x: x.name)
for dir_entry in ordered_entries:
for dir_entry in dir_iter:

# Match filename only patterns
if filename_only_patterns:
m = regex.match(os.path.normcase(dir_entry.name))
if m:
for pattern_name in filename_only_patterns:
if m.group(pattern_name):
matched_paths[pattern_name].append(dir_entry.path)
break

# Collect subdirectories
if depth >= max_depth:
continue

try:
it_is_a_dir = dir_entry.is_dir(follow_symlinks=True)
if not dir_entry.is_dir(follow_symlinks=True):
continue
if sys.platform == "win32":
# Note: st_ino/st_dev on DirEntry.stat are not set on Windows, so we have
# to call os.stat
stat_info = os.stat(dir_entry.path, follow_symlinks=True)
else:
stat_info = dir_entry.stat(follow_symlinks=True)
except OSError as e:
# Possible permission issue, or a symlink that cannot be resolved (ELOOP).
_log_file_access_issue(e, dir_entry.path)
continue

if it_is_a_dir:
if depth >= max_depth:
continue
try:
# The stat should be performed in a try/except block. We repeat that here
# vs. moving to the above block because we only want to call `stat` if we
# haven't exceeded our max_depth
if sys.platform == "win32":
# Note: st_ino/st_dev on DirEntry.stat are not set on Windows, so we
# have to call os.stat
stat_info = os.stat(dir_entry.path, follow_symlinks=True)
else:
stat_info = dir_entry.stat(follow_symlinks=True)
except OSError as e:
_log_file_access_issue(e, dir_entry.path)
continue
subdirs.append((dir_entry.path, _file_id(stat_info)))

dir_id = _file_id(stat_info)
if dir_id not in visited_dirs:
dir_queue.appendleft((depth + 1, dir_entry.path))
visited_dirs.add(dir_id)
elif filename_only_patterns:
m = regex.match(os.path.normcase(dir_entry.name))
if not m:
continue
for pattern_name in filename_only_patterns:
if m.group(pattern_name):
matched_paths[pattern_name].append(dir_entry.path)
break
# Enqueue subdirectories in a deterministic order
if subdirs:
subdirs.sort(key=lambda s: os.path.basename(s[0]))
for subdir, subdir_id in subdirs:
if subdir_id not in visited_dirs:
dir_queue.appendleft((depth + 1, subdir))
visited_dirs.add(subdir_id)

# Sort the matched paths for deterministic output
for paths in matched_paths.values():
paths.sort()
all_matching_paths = [path for paths in matched_paths.values() for path in paths]

# we only dedupe files if we have any complex patterns, since only they can match the same file
# We only dedupe files if we have any complex patterns, since only they can match the same file
# multiple times
return _dedupe_files(all_matching_paths) if complex_patterns else all_matching_paths

Expand Down
22 changes: 5 additions & 17 deletions lib/spack/spack/test/llnl/util/filesystem.py
Original file line number Diff line number Diff line change
Expand Up @@ -1130,16 +1130,16 @@ def complex_dir_structure(request, tmpdir):
<root>/
l1-d1/
l2-d1/
l3-s1 -> l1-d2 # points to directory above l2-d1
l3-d2/
l4-f1
l3-s3 -> l1-d1 # cyclic link
l3-d4/
l4-f2
l3-s1 -> l1-d2 # points to directory above l2-d1
l3-s3 -> l1-d1 # cyclic link
l1-d2/
l2-f1
l2-d2/
l3-f3
l2-f1
l2-s3 -> l2-d2
l1-s3 -> l3-d4 # a link that "skips" a directory level
l1-s4 -> l2-s3 # a link to a link to a dir
Expand All @@ -1155,7 +1155,7 @@ def complex_dir_structure(request, tmpdir):
l3_d2 = l2_d1.join("l3-d2").ensure(dir=True)
l3_d4 = l2_d1.join("l3-d4").ensure(dir=True)
l1_d2 = tmpdir.join("l1-d2").ensure(dir=True)
l2_d2 = l1_d2.join("l1-d2").ensure(dir=True)
l2_d2 = l1_d2.join("l2-d2").ensure(dir=True)

if use_junctions:
link_fn = llnl.util.symlink._windows_create_junction
Expand Down Expand Up @@ -1216,7 +1216,7 @@ def test_find_max_depth_multiple_and_repeated_entry_points(complex_dir_structure

def test_multiple_patterns(complex_dir_structure):
root, _ = complex_dir_structure
paths = fs.find(root, ["l2-f1", "l*-d*/l3-f3", "*", "*/*"])
paths = fs.find(root, ["l2-f1", "l*-d*/l3-f3", "*-f*", "*/*-f*"])
# There shouldn't be duplicate results with multiple, overlapping patterns
assert len(set(paths)) == len(paths)
# All files should be found
Expand Down Expand Up @@ -1249,15 +1249,3 @@ def test_find_input_types(tmp_path: pathlib.Path):

with pytest.raises(TypeError):
fs.find(1, "file.txt") # type: ignore


def test_find_only_finds_files(tmp_path: pathlib.Path):
"""ensure that find only returns files even at max_depth"""
(tmp_path / "subdir").mkdir()
(tmp_path / "subdir" / "dir").mkdir()
(tmp_path / "subdir" / "file.txt").write_text("")
assert (
fs.find(tmp_path, "*", max_depth=1)
== fs.find(tmp_path, "*/*", max_depth=1)
== [str(tmp_path / "subdir" / "file.txt")]
)

0 comments on commit a9e6074

Please sign in to comment.