Skip to content

Commit

Permalink
llnl.util.filesystem: multiple entrypoints and max_depth (spack#47495)
Browse files Browse the repository at this point in the history
If a package `foo` doesn't implement `libs`, the default was to search recursively for `libfoo` whenever asking for `spec[foo].libs` (this also happens automatically if a package includes `foo` as a link dependency).

This can lead to some strange behavior:
1. A package that is normally used as a build dependency (e.g. `cmake` at one point) is referenced like
   `depends_on(cmake)` which leads to a fully-recursive search for `libcmake` (this can take
   "forever" when CMake is registered as an external with a prefix like `/usr`, particularly on NFS mounts).
2. A similar hang can occur if a package is registered as an external with an incorrect prefix

- [x] Update the default library search to stop after a maximum depth (by default, search
  the root prefix and each directory in it, but no lower).
- [x] 

The following is a list of known changes to `find` compared to `develop`:

1. Matching directories are no longer returned -- `find` consistently only finds non-dirs, 
   even at `max_depth`
2. Symlinked directories are followed (needed to support max_depth)
3. `find(..., "dir/*.txt")` is allowed, for finding files inside certain dirs. These "complex"
   patterns are delegated to `glob`, like they are on `develop`.
4. `root` and `files` arguments both support generic sequences, and `root`
   allows both `str` and `path` types. This allows us to specify multiple entry points to `find`.

---------

Co-authored-by: Peter Scheibel <scheibel1@llnl.gov>
  • Loading branch information
haampie and scheibelp authored Nov 8, 2024
1 parent 4778d2d commit 907a371
Show file tree
Hide file tree
Showing 7 changed files with 457 additions and 113 deletions.
265 changes: 190 additions & 75 deletions lib/spack/llnl/util/filesystem.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,23 @@
import tempfile
from contextlib import contextmanager
from itertools import accumulate
from typing import Callable, Iterable, List, Match, Optional, Tuple, Union
from typing import (
Callable,
Deque,
Dict,
Iterable,
List,
Match,
Optional,
Sequence,
Set,
Tuple,
Union,
)

import llnl.util.symlink
from llnl.util import tty
from llnl.util.lang import dedupe, memoized
from llnl.util.lang import dedupe, fnmatch_translate_multiple, memoized
from llnl.util.symlink import islink, readlink, resolve_link_target_relative_to_the_link, symlink

from ..path import path_to_os_path, system_path_filter
Expand Down Expand Up @@ -85,6 +97,8 @@
"visit_directory_tree",
]

Path = Union[str, pathlib.Path]

if sys.version_info < (3, 7, 4):
# monkeypatch shutil.copystat to fix PermissionError when copying read-only
# files on Lustre when using Python < 3.7.4
Expand Down Expand Up @@ -1673,105 +1687,199 @@ def find_first(root: str, files: Union[Iterable[str], str], bfs_depth: int = 2)
return FindFirstFile(root, *files, bfs_depth=bfs_depth).find()


def find(root, files, recursive=True):
"""Search for ``files`` starting from the ``root`` directory.
Like GNU/BSD find but written entirely in Python.
Examples:
.. code-block:: console
$ find /usr -name python
is equivalent to:
>>> find('/usr', 'python')
.. code-block:: console
$ find /usr/local/bin -maxdepth 1 -name python
is equivalent to:
>>> find('/usr/local/bin', 'python', recursive=False)
def find(
root: Union[Path, Sequence[Path]],
files: Union[str, Sequence[str]],
recursive: bool = True,
max_depth: Optional[int] = None,
) -> List[str]:
"""Finds all non-directory files matching the patterns from ``files`` starting from ``root``.
This function returns a deterministic result for the same input and directory structure when
run multiple times. Symlinked directories are followed, and unique directories are searched
only once. Each matching file is returned only once at lowest depth in case multiple paths
exist due to symlinked directories.
Accepts any glob characters accepted by fnmatch:
========== ====================================
Pattern Meaning
========== ====================================
``*`` matches everything
``*`` matches one or more characters
``?`` matches any single character
``[seq]`` matches any character in ``seq``
``[!seq]`` matches any character not in ``seq``
========== ====================================
Examples:
>>> find("/usr", "*.txt", recursive=True, max_depth=2)
finds all files with the extension ``.txt`` in the directory ``/usr`` and subdirectories up to
depth 2.
>>> find(["/usr", "/var"], ["*.txt", "*.log"], recursive=True)
finds all files with the extension ``.txt`` or ``.log`` in the directories ``/usr`` and
``/var`` at any depth.
>>> find("/usr", "GL/*.h", recursive=True)
finds all header files in a directory GL at any depth in the directory ``/usr``.
Parameters:
root (str): The root directory to start searching from
files (str or collections.abc.Sequence): Library name(s) to search for
recursive (bool): if False search only root folder,
if True descends top-down from the root. Defaults to True.
root: One or more root directories to start searching from
files: One or more filename patterns to search for
recursive: if False search only root, if True descends from roots. Defaults to True.
max_depth: if set, don't search below this depth. Cannot be set if recursive is False
Returns:
list: The files that have been found
Returns a list of absolute, matching file paths.
"""
if isinstance(root, (str, pathlib.Path)):
root = [root]
elif not isinstance(root, collections.abc.Sequence):
raise TypeError(f"'root' arg must be a path or a sequence of paths, not '{type(root)}']")

if isinstance(files, str):
files = [files]
elif not isinstance(files, collections.abc.Sequence):
raise TypeError(f"'files' arg must be str or a sequence of str, not '{type(files)}']")

if recursive:
tty.debug(f"Find (recursive): {root} {str(files)}")
result = _find_recursive(root, files)
else:
tty.debug(f"Find (not recursive): {root} {str(files)}")
result = _find_non_recursive(root, files)
# If recursive is false, max_depth can only be None or 0
if max_depth and not recursive:
raise ValueError(f"max_depth ({max_depth}) cannot be set if recursive is False")

tty.debug(f"Find complete: {root} {str(files)}")
tty.debug(f"Find (max depth = {max_depth}): {root} {files}")
if not recursive:
max_depth = 0
elif max_depth is None:
max_depth = sys.maxsize
result = _find_max_depth(root, files, max_depth)
tty.debug(f"Find complete: {root} {files}")
return result


@system_path_filter
def _find_recursive(root, search_files):
# The variable here is **on purpose** a defaultdict. The idea is that
# we want to poke the filesystem as little as possible, but still maintain
# stability in the order of the answer. Thus we are recording each library
# found in a key, and reconstructing the stable order later.
found_files = collections.defaultdict(list)

# Make the path absolute to have os.walk also return an absolute path
root = os.path.abspath(root)
for path, _, list_files in os.walk(root):
for search_file in search_files:
matches = glob.glob(os.path.join(path, search_file))
matches = [os.path.join(path, x) for x in matches]
found_files[search_file].extend(matches)
def _log_file_access_issue(e: OSError, path: str) -> None:
errno_name = errno.errorcode.get(e.errno, "UNKNOWN")
tty.debug(f"find must skip {path}: {errno_name} {e}")

answer = []
for search_file in search_files:
answer.extend(found_files[search_file])

return answer
def _file_id(s: os.stat_result) -> Tuple[int, int]:
# Note: on windows, st_ino is the file index and st_dev is the volume serial number. See
# https://github.com/python/cpython/blob/3.9/Python/fileutils.c
return (s.st_ino, s.st_dev)


@system_path_filter
def _find_non_recursive(root, search_files):
# The variable here is **on purpose** a defaultdict as os.list_dir
# can return files in any order (does not preserve stability)
found_files = collections.defaultdict(list)
def _dedupe_files(paths: List[str]) -> List[str]:
"""Deduplicate files by inode and device, dropping files that cannot be accessed."""
unique_files: List[str] = []
# tuple of (inode, device) for each file without following symlinks
visited: Set[Tuple[int, int]] = set()
for path in paths:
try:
stat_info = os.lstat(path)
except OSError as e:
_log_file_access_issue(e, path)
continue
file_id = _file_id(stat_info)
if file_id not in visited:
unique_files.append(path)
visited.add(file_id)
return unique_files


def _find_max_depth(
roots: Sequence[Path], globs: Sequence[str], max_depth: int = sys.maxsize
) -> List[str]:
"""See ``find`` for the public API."""
# We optimize for the common case of simple filename only patterns: a single, combined regex
# is used. For complex patterns that include path components, we use a slower glob call from
# every directory we visit within max_depth.
filename_only_patterns = {
f"pattern_{i}": os.path.normcase(x) for i, x in enumerate(globs) if "/" not in x
}
complex_patterns = {f"pattern_{i}": x for i, x in enumerate(globs) if "/" in x}
regex = re.compile(fnmatch_translate_multiple(filename_only_patterns))
# Ordered dictionary that keeps track of what pattern found which files
matched_paths: Dict[str, List[str]] = {f"pattern_{i}": [] for i, _ in enumerate(globs)}
# Ensure returned paths are always absolute
roots = [os.path.abspath(r) for r in roots]
# Breadth-first search queue. Each element is a tuple of (depth, dir)
dir_queue: Deque[Tuple[int, str]] = collections.deque()
# Set of visited directories. Each element is a tuple of (inode, device)
visited_dirs: Set[Tuple[int, int]] = set()

for root in roots:
try:
stat_root = os.stat(root)
except OSError as e:
_log_file_access_issue(e, root)
continue
dir_id = _file_id(stat_root)
if dir_id not in visited_dirs:
dir_queue.appendleft((0, root))
visited_dirs.add(dir_id)

# Make the path absolute to have absolute path returned
root = os.path.abspath(root)
while dir_queue:
depth, curr_dir = dir_queue.pop()
try:
dir_iter = os.scandir(curr_dir)
except OSError as e:
_log_file_access_issue(e, curr_dir)
continue

for search_file in search_files:
matches = glob.glob(os.path.join(root, search_file))
matches = [os.path.join(root, x) for x in matches]
found_files[search_file].extend(matches)
# Use glob.glob for complex patterns.
for pattern_name, pattern in complex_patterns.items():
matched_paths[pattern_name].extend(
path
for path in glob.glob(os.path.join(curr_dir, pattern))
if not os.path.isdir(path)
)

answer = []
for search_file in search_files:
answer.extend(found_files[search_file])
with dir_iter:
ordered_entries = sorted(dir_iter, key=lambda x: x.name)
for dir_entry in ordered_entries:
try:
it_is_a_dir = dir_entry.is_dir(follow_symlinks=True)
except OSError as e:
# Possible permission issue, or a symlink that cannot be resolved (ELOOP).
_log_file_access_issue(e, dir_entry.path)
continue

return answer
if it_is_a_dir:
if depth >= max_depth:
continue
try:
# The stat should be performed in a try/except block. We repeat that here
# vs. moving to the above block because we only want to call `stat` if we
# haven't exceeded our max_depth
if sys.platform == "win32":
# Note: st_ino/st_dev on DirEntry.stat are not set on Windows, so we
# have to call os.stat
stat_info = os.stat(dir_entry.path, follow_symlinks=True)
else:
stat_info = dir_entry.stat(follow_symlinks=True)
except OSError as e:
_log_file_access_issue(e, dir_entry.path)
continue

dir_id = _file_id(stat_info)
if dir_id not in visited_dirs:
dir_queue.appendleft((depth + 1, dir_entry.path))
visited_dirs.add(dir_id)
elif filename_only_patterns:
m = regex.match(os.path.normcase(dir_entry.name))
if not m:
continue
for pattern_name in filename_only_patterns:
if m.group(pattern_name):
matched_paths[pattern_name].append(dir_entry.path)
break

all_matching_paths = [path for paths in matched_paths.values() for path in paths]

# we only dedupe files if we have any complex patterns, since only they can match the same file
# multiple times
return _dedupe_files(all_matching_paths) if complex_patterns else all_matching_paths


# Utilities for libraries and headers
Expand Down Expand Up @@ -2210,7 +2318,9 @@ def find_system_libraries(libraries, shared=True):
return libraries_found


def find_libraries(libraries, root, shared=True, recursive=False, runtime=True):
def find_libraries(
libraries, root, shared=True, recursive=False, runtime=True, max_depth: Optional[int] = None
):
"""Returns an iterable of full paths to libraries found in a root dir.
Accepts any glob characters accepted by fnmatch:
Expand All @@ -2231,6 +2341,8 @@ def find_libraries(libraries, root, shared=True, recursive=False, runtime=True):
otherwise for static. Defaults to True.
recursive (bool): if False search only root folder,
if True descends top-down from the root. Defaults to False.
max_depth (int): if set, don't search below this depth. Cannot be set
if recursive is False
runtime (bool): Windows only option, no-op elsewhere. If true,
search for runtime shared libs (.DLL), otherwise, search
for .Lib files. If shared is false, this has no meaning.
Expand All @@ -2239,6 +2351,7 @@ def find_libraries(libraries, root, shared=True, recursive=False, runtime=True):
Returns:
LibraryList: The libraries that have been found
"""

if isinstance(libraries, str):
libraries = [libraries]
elif not isinstance(libraries, collections.abc.Sequence):
Expand Down Expand Up @@ -2271,8 +2384,10 @@ def find_libraries(libraries, root, shared=True, recursive=False, runtime=True):
libraries = ["{0}.{1}".format(lib, suffix) for lib in libraries for suffix in suffixes]

if not recursive:
if max_depth:
raise ValueError(f"max_depth ({max_depth}) cannot be set if recursive is False")
# If not recursive, look for the libraries directly in root
return LibraryList(find(root, libraries, False))
return LibraryList(find(root, libraries, recursive=False))

# To speedup the search for external packages configured e.g. in /usr,
# perform first non-recursive search in root/lib then in root/lib64 and
Expand All @@ -2290,7 +2405,7 @@ def find_libraries(libraries, root, shared=True, recursive=False, runtime=True):
if found_libs:
break
else:
found_libs = find(root, libraries, True)
found_libs = find(root, libraries, recursive=True, max_depth=max_depth)

return LibraryList(found_libs)

Expand Down
17 changes: 16 additions & 1 deletion lib/spack/llnl/util/lang.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,17 @@

import collections.abc
import contextlib
import fnmatch
import functools
import itertools
import os
import re
import sys
import traceback
import typing
import warnings
from datetime import datetime, timedelta
from typing import Callable, Iterable, List, Tuple, TypeVar
from typing import Callable, Dict, Iterable, List, Tuple, TypeVar

# Ignore emacs backups when listing modules
ignore_modules = r"^\.#|~$"
Expand Down Expand Up @@ -859,6 +861,19 @@ def elide_list(line_list: List[str], max_num: int = 10) -> List[str]:
return line_list


if sys.version_info >= (3, 9):
PatternStr = re.Pattern[str]
else:
PatternStr = typing.Pattern[str]


def fnmatch_translate_multiple(named_patterns: Dict[str, str]) -> str:
"""Similar to ``fnmatch.translate``, but takes an ordered dictionary where keys are pattern
names, and values are filename patterns. The output is a regex that matches any of the
patterns in order, and named capture groups are used to identify which pattern matched."""
return "|".join(f"(?P<{n}>{fnmatch.translate(p)})" for n, p in named_patterns.items())


@contextlib.contextmanager
def nullcontext(*args, **kwargs):
"""Empty context manager.
Expand Down
Loading

0 comments on commit 907a371

Please sign in to comment.