Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 17 additions & 8 deletions src/datasets/data_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,14 +362,23 @@ def resolve_pattern(
if protocol == "hf":
# 10 times faster glob with detail=True (ignores costly info like lastCommit)
glob_kwargs["expand_info"] = False
matched_paths = [
filepath if "://" in filepath else protocol_prefix + filepath
for filepath, info in fs.glob(pattern, detail=True, **glob_kwargs).items()
if (info["type"] == "file" or (info.get("islink") and os.path.isfile(os.path.realpath(filepath))))
and (xbasename(filepath) not in files_to_ignore)
and not _is_inside_unrequested_special_dir(filepath, fs_pattern)
and not _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(filepath, fs_pattern)
] # ignore .ipynb and __pycache__, but keep /../
matched_paths = []
for filepath, info in fs.glob(pattern, detail=True, **glob_kwargs).items():
# fsspec might report local symlinks as type=="other" and omit the islink flag.
# In that case, we still want to include them if they point to a regular file.
is_file = info.get("type") == "file"
is_link = bool(info.get("islink")) or (protocol == "file" and os.path.islink(filepath))
if not (is_file or (is_link and os.path.isfile(os.path.realpath(filepath)))):
continue
if xbasename(filepath) in files_to_ignore:
continue
if _is_inside_unrequested_special_dir(filepath, fs_pattern):
continue
if _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(filepath, fs_pattern):
continue

matched_paths.append(filepath if "://" in filepath else protocol_prefix + filepath)
# ignore .ipynb and __pycache__, but keep /../
if allowed_extensions is not None:
out = [
filepath
Expand Down
40 changes: 40 additions & 0 deletions tests/test_data_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,46 @@ def test_resolve_pattern_locally_does_not_resolve_symbolic_links(tmp_path, compl
assert Path(resolved_data_files[0]) == tmp_path / "train_data_symlink.txt"


@pytest.mark.skipif(os.name == "nt", reason="Windows does not support symlinks in the default mode")
def test_resolve_pattern_locally_glob_includes_symlinked_files(tmp_path):
    """
    `resolve_pattern` must keep a local symlink that targets a regular file,
    even when the filesystem's glob describes it as type='other' and provides
    no 'islink' entry (as some fsspec versions reportedly do).
    """
    from fsspec.core import url_to_fs as fsspec_url_to_fs

    # Layout: data/<symlink>.parquet -> blobs/file.parquet
    blobs_dir = tmp_path / "blobs"
    data_dir = tmp_path / "data"
    blobs_dir.mkdir()
    data_dir.mkdir()

    target = blobs_dir / "file.parquet"
    target.write_bytes(b"parquet")
    link = data_dir / "train-00000-of-00001.parquet"
    link.symlink_to(target)

    pattern = str(data_dir / "*.parquet")
    real_fs, fs_pattern = fsspec_url_to_fs(pattern)

    class WrappedFS:
        """Delegates glob to the real filesystem, then degrades the entry metadata."""

        def __init__(self, fs):
            self._fs = fs
            self.protocol = fs.protocol

        def glob(self, path, detail=True, **kwargs):
            found = self._fs.glob(path, detail=detail, **kwargs)
            if not detail:
                return found
            # Simulate the problematic metadata: unknown type, no symlink flag.
            for entry in found.values():
                entry["type"] = "other"
                entry.pop("islink", None)
            return found

    fake_url_to_fs_result = (WrappedFS(real_fs), fs_pattern)
    with patch("datasets.data_files.url_to_fs", return_value=fake_url_to_fs_result):
        resolved_data_files = resolve_pattern(pattern, str(tmp_path))

    # Exactly the symlink itself is returned, not its resolved target.
    assert len(resolved_data_files) == 1
    assert Path(resolved_data_files[0]) == link


def test_resolve_pattern_locally_sorted_files(tmp_path_factory):
path = str(tmp_path_factory.mktemp("unsorted_text_files"))
unsorted_names = ["0.txt", "2.txt", "3.txt"]
Expand Down