[WIP] ENH: support reading directory in read_csv #61275

Open · wants to merge 81 commits into base: main
Commits (81)
84d6bd3
Add Pandas Cookbook to Book Recommendations (#61271)
WillAyd Apr 11, 2025
16cf492
bug fix
fangchenli Apr 12, 2025
b69fad1
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Apr 13, 2025
822dffc
fix win related error
fangchenli Apr 13, 2025
5637dca
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Apr 18, 2025
3905f1c
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Apr 21, 2025
361c41c
add encoding
fangchenli Apr 21, 2025
02f93bd
fix import
fangchenli Apr 21, 2025
c77158e
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli May 4, 2025
179f911
format
fangchenli May 4, 2025
d7bef62
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli May 6, 2025
db1c7ed
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli May 8, 2025
91a7956
improve test
fangchenli May 8, 2025
8b5cdd4
debug for new fsspec
fangchenli May 8, 2025
13c1258
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli May 9, 2025
abce2fd
debug min version fsspec
fangchenli May 9, 2025
70bcb2a
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli May 9, 2025
b99b641
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli May 11, 2025
14d7afc
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli May 13, 2025
2a445f3
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli May 19, 2025
3173270
format
fangchenli May 22, 2025
f94a0bf
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli May 22, 2025
38bed64
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli May 24, 2025
2a66b92
Merge remote-tracking branch 'upstream' into read-csv-from-directory
fangchenli May 30, 2025
a2b65e1
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jun 2, 2025
d77d290
fix format
fangchenli Jun 2, 2025
2eee5e2
fix test
fangchenli Jun 5, 2025
b6b48e9
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jun 7, 2025
f3a10e0
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jun 8, 2025
103db50
fix typing
fangchenli Jun 8, 2025
b7e055c
fix remote path
fangchenli Jun 8, 2025
48770ec
update docstring, fix error msg
fangchenli Jun 8, 2025
afbfc6a
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jun 10, 2025
6e8aec7
fix fsspec test
fangchenli Jun 10, 2025
f4ecd5e
handle chained url
fangchenli Jun 10, 2025
8744d85
remove remote test
fangchenli Jun 10, 2025
109edd8
fix windows
fangchenli Jun 10, 2025
1f310d7
try to fix win
fangchenli Jun 11, 2025
f86728f
handle http
fangchenli Jun 11, 2025
b5a0d1d
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jun 12, 2025
41cdb25
fix win
fangchenli Jun 15, 2025
9d1a055
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jun 15, 2025
dc3ac22
fix win
fangchenli Jun 16, 2025
3e4b032
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jun 16, 2025
56bb9a6
fix error msg
fangchenli Jun 16, 2025
f2f6a8c
fix test
fangchenli Jun 16, 2025
a7e04d3
add iterdir test
fangchenli Jun 16, 2025
3229482
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jun 18, 2025
e74fe09
add local test
fangchenli Jun 18, 2025
73b8ffe
try to fix win
fangchenli Jun 18, 2025
977358b
debug win
fangchenli Jun 19, 2025
a7ebbcd
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jun 19, 2025
32951cc
reactivate tests
fangchenli Jun 20, 2025
c8f41ee
roll back
fangchenli Jun 20, 2025
594a81b
rollback
fangchenli Jun 20, 2025
2f0bc00
debug win
fangchenli Jun 20, 2025
3f2e164
debug win
fangchenli Jun 20, 2025
c756492
debug win
fangchenli Jun 21, 2025
5e269ad
debug win
fangchenli Jun 21, 2025
b23caa8
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jun 24, 2025
1dd4080
fix tyoe, debug url request
fangchenli Jun 24, 2025
1c3f6fa
url passthrough
fangchenli Jun 24, 2025
4aca1b5
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jun 25, 2025
91c9e6d
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jun 25, 2025
d85b997
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jul 1, 2025
4e6cde7
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jul 2, 2025
5600c45
fix s3 url
fangchenli Jul 2, 2025
6ccbc91
fix none
fangchenli Jul 3, 2025
47e47a2
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jul 3, 2025
a3eebbb
fix error msg
fangchenli Jul 3, 2025
4e75a1d
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jul 3, 2025
90a8441
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jul 3, 2025
63bbde8
ignore type
fangchenli Jul 3, 2025
ff6fcf1
ignore type
fangchenli Jul 3, 2025
9040aa5
fix
fangchenli Jul 3, 2025
57813e4
fix mock remote dir test
fangchenli Jul 4, 2025
577726e
fix typo
fangchenli Jul 4, 2025
f2f70bb
format
fangchenli Jul 4, 2025
3ffcb4e
cleanup multi file generator
fangchenli Jul 4, 2025
daf6945
cleanup test
fangchenli Jul 4, 2025
22c6579
format
fangchenli Jul 4, 2025
204 changes: 203 additions & 1 deletion pandas/io/common.py
@@ -10,6 +10,7 @@
from collections import defaultdict
from collections.abc import (
Hashable,
Iterable,
Mapping,
Sequence,
)
@@ -26,7 +27,10 @@
)
import mmap
import os
from pathlib import Path
from pathlib import (
Path,
PurePosixPath,
)
import re
import tarfile
from typing import (
@@ -42,6 +46,7 @@
overload,
)
from urllib.parse import (
unquote,
urljoin,
urlparse as parse_url,
uses_netloc,
@@ -55,6 +60,7 @@
BaseBuffer,
ReadCsvBuffer,
)
from pandas.compat import is_platform_windows
from pandas.compat._optional import import_optional_dependency
from pandas.util._decorators import doc
from pandas.util._exceptions import find_stack_level
@@ -1282,3 +1288,199 @@ def dedup_names(
counts[col] = cur_count + 1

return names


def _infer_protocol(path: str) -> str:
"""
Infer the protocol of a given path string.
Parameters
----------
path : str
The path string to infer the protocol from.
Returns
-------
str
The inferred protocol.
"""
# Treat Windows drive letters like C:\ as local file paths
if is_platform_windows() and re.match(r"^[a-zA-Z]:[\\/]", path):
return "file"

if is_fsspec_url(path) or path.startswith("http"):
parsed = parse_url(path)
return parsed.scheme
return "file"


def _match_file(
path: Path | PurePosixPath, extensions: set[str] | None, glob: str | None
) -> bool:
"""
Check if the file matches the given extensions and glob pattern.
Parameters
----------
path : Path or PurePosixPath
The file path to check.
extensions : set[str]
A set of file extensions to match against.
glob : str
A glob pattern to match against.
Returns
-------
bool
True if the file matches the extensions and glob pattern, False otherwise.
"""
return (extensions is None or path.suffix.lower() in extensions) and (
glob is None or path.match(glob)
)
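
A minimal sketch of the matching semantics (hypothetical file names, chosen for illustration):

from pathlib import Path
# Both filters must pass; a None filter always passes.
_match_file(Path("data/a.csv"), {".csv"}, None)      # -> True
_match_file(Path("data/notes.txt"), {".csv"}, None)  # -> False (extension filter)
_match_file(Path("data/a.csv"), {".csv"}, "b.*")     # -> False (glob filter)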


def _resolve_local_path(path_str: str) -> Path:
"""
Resolve a local file path, handling Windows paths and file URLs.
Parameters
----------
path_str : str
The path string to resolve.
Returns
-------
Path
A Path object representing the resolved local path.
"""
parsed = parse_url(path_str)

if is_platform_windows():
if parsed.scheme == "file":
if parsed.netloc:
return Path(f"//{parsed.netloc}{unquote(parsed.path)}")
return Path(unquote(parsed.path.lstrip("/")))

if re.match(r"^[a-zA-Z]:[\\/]", path_str):
return Path(unquote(path_str))

return Path(unquote(parsed.path))
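
Illustrative resolutions under the branches above (a sketch; the exact textual rendering of the resulting Path differs by platform):

_resolve_local_path("file:///home/user/data")    # POSIX   -> Path("/home/user/data")
_resolve_local_path("file:///C:/Temp/data")      # Windows -> Path("C:/Temp/data")
_resolve_local_path("file://server/share/data")  # Windows -> Path("//server/share/data") (UNC)
_resolve_local_path("C:\\Temp\\data")            # Windows -> Path("C:/Temp/data")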


def iterdir(
path: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
extensions: str | Iterable[str] | None = None,
glob: str | None = None,
storage_options: StorageOptions | None = None,
) -> FilePath | list[FilePath] | ReadCsvBuffer[bytes] | ReadCsvBuffer[str]:
"""
Return the file paths in a directory (non-recursive). File-like objects
and plain URLs are returned directly. Remote paths are handled via fsspec.

Supports:
- Local paths (str, os.PathLike)
- file:// URLs
- Remote paths (e.g., s3://) via fsspec (if installed)

Parameters
----------
path : FilePath or file-like object
Path to the directory or file (local or remote), or an open file-like object.
extensions : str or list of str, optional
Only include files with the given extension(s). Case-insensitive.
If None, all files are included.
glob : str, optional
Only include files matching the given glob pattern.
If None, all files are included.
storage_options : StorageOptions, optional
Extra options (e.g. credentials) forwarded to fsspec for remote paths.

Returns
-------
list of str or Path, or BaseBuffer
If `path` is a file-like object or URL, it is returned directly.
Otherwise, a list of the matching file paths in the directory.

Raises
------
TypeError
If `path` is not a string, os.PathLike, or file-like object.
FileNotFoundError
If the specified path does not exist.
ValueError
If the specified path is neither a file nor a directory.
ImportError
If fsspec is required but not installed.
"""

# file-like objects and urls are returned directly
if hasattr(path, "read") or hasattr(path, "write") or is_url(path):
return path

if not isinstance(path, (str, os.PathLike)):
raise TypeError(
f"Expected file path name or file-like object, got {type(path)} type"
)

if extensions is not None:
if isinstance(extensions, str):
extensions = {extensions.lower()}
else:
extensions = {ext.lower() for ext in extensions}

path_str = os.fspath(path)
scheme = _infer_protocol(path_str)

if scheme == "file":
resolved_path = _resolve_local_path(path_str)
if not resolved_path.exists():
raise FileNotFoundError(f"No such file or directory: '{resolved_path}'")

result = []
if resolved_path.is_file():
if _match_file(
resolved_path,
extensions,
glob,
):
result.append(resolved_path)
return result # type: ignore[return-value]

if resolved_path.is_dir():
for entry in resolved_path.iterdir():
if entry.is_file():
if _match_file(
entry,
extensions,
glob,
):
result.append(entry)
return result # type: ignore[return-value]

raise ValueError(
f"The path '{resolved_path}' is neither a file nor a directory."
)

# Remote paths
fsspec = import_optional_dependency("fsspec")

# GH #11071
# Two legacy S3 protocols (s3n and s3a) are replaced with s3
if path_str.startswith("s3n://"):
path_str = path_str.replace("s3n://", "s3://")
if path_str.startswith("s3a://"):
path_str = path_str.replace("s3a://", "s3://")

fs, inner_path = fsspec.core.url_to_fs(path_str, **(storage_options or {}))
if fs.isfile(inner_path):
path_obj = PurePosixPath(inner_path)
if _match_file(
path_obj,
extensions,
glob,
):
return [path]

result = []
for file in fs.ls(inner_path, detail=True):
if file["type"] == "file":
path_obj = PurePosixPath(file["name"])
if _match_file(
path_obj,
extensions,
glob,
):
result.append(f"{scheme}://{path_obj}") # type: ignore[arg-type]
return result # type: ignore[return-value]
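
Putting the helper together, a hedged usage sketch (assuming a local directory data/ containing a.csv, b.csv, and notes.txt; iterdir is the private helper added above, so this is illustration rather than public API):

import io
from pandas.io.common import iterdir

iterdir("data", extensions=".csv")  # -> [Path("data/a.csv"), Path("data/b.csv")] (order not guaranteed)
iterdir("data", glob="a.*")         # -> [Path("data/a.csv")]
iterdir("data/a.csv")               # single file -> [Path("data/a.csv")]

# File-like objects (and plain URLs) pass through unchanged
buf = io.StringIO("x,y\n1,2\n")
iterdir(buf) is buf                 # -> True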
74 changes: 60 additions & 14 deletions pandas/io/parsers/readers.py
@@ -55,6 +55,7 @@
from pandas.io.common import (
IOHandles,
get_handle,
iterdir,
stringify_path,
validate_header_arg,
)
@@ -73,6 +74,7 @@
if TYPE_CHECKING:
from collections.abc import (
Callable,
Generator,
Hashable,
Iterable,
Mapping,
@@ -668,9 +670,41 @@ def _validate_names(names: Sequence[Hashable] | None) -> None:
raise ValueError("Names should be an ordered collection.")


def _multi_file_generator(
list_of_files: list[FilePath], kwds
) -> Generator[DataFrame] | Generator[TextFileReader]:
"""
Generator that yields DataFrames or TextFileReaders for each file in the
provided list of files.
Parameters
----------
list_of_files : list of str or Path
List of file paths to read.
kwds : dict
Keyword arguments to pass to the TextFileReader.
Returns
-------
Generator[DataFrame] | Generator[TextFileReader]
A generator that yields DataFrames or TextFileReaders for each file.
"""

chunksize = kwds.get("chunksize", None)
iterator = kwds.get("iterator", False)
nrows = kwds.get("nrows", None)

for file in list_of_files:
parser = TextFileReader(file, **kwds)

if chunksize or iterator:
yield parser
else:
with parser:
yield parser.read(nrows)


def _read(
filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], kwds
) -> DataFrame | TextFileReader:
) -> DataFrame | TextFileReader | Generator[DataFrame] | Generator[TextFileReader]:
"""Generic reader of line files."""
# if we pass a date_format and parse_dates=False, we should not parse the
# dates GH#44366
@@ -709,14 +743,27 @@ def _read(
# Check for duplicates in names.
_validate_names(kwds.get("names", None))

# Create the parser.
parser = TextFileReader(filepath_or_buffer, **kwds)
extensions = kwds.get("extensions", None)
glob = kwds.get("glob", None)
storage_options = kwds.get("storage_options", None)
files = iterdir(filepath_or_buffer, extensions, glob, storage_options)

if isinstance(files, list) and not files:
raise FileNotFoundError(
f"No files found in {filepath_or_buffer}, "
f"with extension(s) {extensions} and glob pattern {glob}"
)

if chunksize or iterator:
return parser
if (isinstance(files, list) and len(files) == 1) or not isinstance(files, list):
file = files[0] if isinstance(files, list) else files
parser = TextFileReader(file, **kwds)

with parser:
return parser.read(nrows)
if chunksize or iterator:
return parser

with parser:
return parser.read(nrows)
return _multi_file_generator(files, kwds)


@overload
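
With this dispatch in place, read_csv hands back either a single DataFrame/TextFileReader (one file, or a buffer/URL) or a generator with one entry per matched file. A hedged sketch of the calling pattern this WIP enables, assuming the same data/ directory as above (the extensions/glob keywords are inferred from the kwds lookups in _read; the public signature change is not shown in this excerpt):

import pandas as pd

# Directory with several matching files: a generator of DataFrames
frames = pd.read_csv("data")
combined = pd.concat(frames, ignore_index=True)

# With chunksize or iterator set, the generator yields TextFileReader objects
for reader in pd.read_csv("data", chunksize=1_000):
    for chunk in reader:
        print(chunk.shape)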
@@ -832,7 +879,7 @@ def read_csv(
float_precision: Literal["high", "legacy", "round_trip"] | None = None,
storage_options: StorageOptions | None = None,
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame | TextFileReader:
) -> DataFrame | TextFileReader | Generator[DataFrame] | Generator[TextFileReader]:
# locals() should never be modified
kwds = locals().copy()
del kwds["filepath_or_buffer"]
@@ -932,10 +979,9 @@ def read_table(
skipfooter: int = 0,
nrows: int | None = None,
# NA and Missing Data Handling
na_values: Hashable
| Iterable[Hashable]
| Mapping[Hashable, Iterable[Hashable]]
| None = None,
na_values: (
Hashable | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] | None
) = None,
keep_default_na: bool = True,
na_filter: bool = True,
skip_blank_lines: bool = True,
@@ -968,7 +1014,7 @@ def read_table(
float_precision: Literal["high", "legacy", "round_trip"] | None = None,
storage_options: StorageOptions | None = None,
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame | TextFileReader:
) -> DataFrame | TextFileReader | Generator[DataFrame] | Generator[TextFileReader]:
# locals() should never be modified
kwds = locals().copy()
del kwds["filepath_or_buffer"]
@@ -1038,7 +1084,7 @@ def read_fwf(
iterator: bool = False,
chunksize: int | None = None,
**kwds: Unpack[_read_shared[HashableT]],
) -> DataFrame | TextFileReader:
) -> DataFrame | TextFileReader | Generator[DataFrame] | Generator[TextFileReader]:
r"""
Read a table of fixed-width formatted lines into DataFrame.
