[WIP] ENH: support reading directory in read_csv #61275

Open · wants to merge 81 commits into base: main
Commits (81)
84d6bd3
Add Pandas Cookbook to Book Recommendations (#61271)
WillAyd Apr 11, 2025
16cf492
bug fix
fangchenli Apr 12, 2025
b69fad1
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Apr 13, 2025
822dffc
fix win related error
fangchenli Apr 13, 2025
5637dca
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Apr 18, 2025
3905f1c
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Apr 21, 2025
361c41c
add encoding
fangchenli Apr 21, 2025
02f93bd
fix import
fangchenli Apr 21, 2025
c77158e
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli May 4, 2025
179f911
format
fangchenli May 4, 2025
d7bef62
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli May 6, 2025
db1c7ed
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli May 8, 2025
91a7956
improve test
fangchenli May 8, 2025
8b5cdd4
debug for new fsspec
fangchenli May 8, 2025
13c1258
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli May 9, 2025
abce2fd
debug min version fsspec
fangchenli May 9, 2025
70bcb2a
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli May 9, 2025
b99b641
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli May 11, 2025
14d7afc
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli May 13, 2025
2a445f3
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli May 19, 2025
3173270
format
fangchenli May 22, 2025
f94a0bf
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli May 22, 2025
38bed64
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli May 24, 2025
2a66b92
Merge remote-tracking branch 'upstream' into read-csv-from-directory
fangchenli May 30, 2025
a2b65e1
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jun 2, 2025
d77d290
fix format
fangchenli Jun 2, 2025
2eee5e2
fix test
fangchenli Jun 5, 2025
b6b48e9
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jun 7, 2025
f3a10e0
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jun 8, 2025
103db50
fix typing
fangchenli Jun 8, 2025
b7e055c
fix remote path
fangchenli Jun 8, 2025
48770ec
update docstring, fix error msg
fangchenli Jun 8, 2025
afbfc6a
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jun 10, 2025
6e8aec7
fix fsspec test
fangchenli Jun 10, 2025
f4ecd5e
handle chained url
fangchenli Jun 10, 2025
8744d85
remove remote test
fangchenli Jun 10, 2025
109edd8
fix windows
fangchenli Jun 10, 2025
1f310d7
try to fix win
fangchenli Jun 11, 2025
f86728f
handle http
fangchenli Jun 11, 2025
b5a0d1d
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jun 12, 2025
41cdb25
fix win
fangchenli Jun 15, 2025
9d1a055
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jun 15, 2025
dc3ac22
fix win
fangchenli Jun 16, 2025
3e4b032
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jun 16, 2025
56bb9a6
fix error msg
fangchenli Jun 16, 2025
f2f6a8c
fix test
fangchenli Jun 16, 2025
a7e04d3
add iterdir test
fangchenli Jun 16, 2025
3229482
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jun 18, 2025
e74fe09
add local test
fangchenli Jun 18, 2025
73b8ffe
try to fix win
fangchenli Jun 18, 2025
977358b
debug win
fangchenli Jun 19, 2025
a7ebbcd
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jun 19, 2025
32951cc
reactivate tests
fangchenli Jun 20, 2025
c8f41ee
roll back
fangchenli Jun 20, 2025
594a81b
rollback
fangchenli Jun 20, 2025
2f0bc00
debug win
fangchenli Jun 20, 2025
3f2e164
debug win
fangchenli Jun 20, 2025
c756492
debug win
fangchenli Jun 21, 2025
5e269ad
debug win
fangchenli Jun 21, 2025
b23caa8
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jun 24, 2025
1dd4080
fix tyoe, debug url request
fangchenli Jun 24, 2025
1c3f6fa
url passthrough
fangchenli Jun 24, 2025
4aca1b5
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jun 25, 2025
91c9e6d
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jun 25, 2025
d85b997
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jul 1, 2025
4e6cde7
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jul 2, 2025
5600c45
fix s3 url
fangchenli Jul 2, 2025
6ccbc91
fix none
fangchenli Jul 3, 2025
47e47a2
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jul 3, 2025
a3eebbb
fix error msg
fangchenli Jul 3, 2025
4e75a1d
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jul 3, 2025
90a8441
Merge remote-tracking branch 'upstream/main' into read-csv-from-direc…
fangchenli Jul 3, 2025
63bbde8
ignore type
fangchenli Jul 3, 2025
ff6fcf1
ignore type
fangchenli Jul 3, 2025
9040aa5
fix
fangchenli Jul 3, 2025
57813e4
fix mock remote dir test
fangchenli Jul 4, 2025
577726e
fix typo
fangchenli Jul 4, 2025
f2f70bb
format
fangchenli Jul 4, 2025
3ffcb4e
cleanup multi file generator
fangchenli Jul 4, 2025
daf6945
cleanup test
fangchenli Jul 4, 2025
22c6579
format
fangchenli Jul 4, 2025
204 changes: 203 additions & 1 deletion pandas/io/common.py
@@ -10,6 +10,7 @@
from collections import defaultdict
from collections.abc import (
Hashable,
Iterable,
Mapping,
Sequence,
)
@@ -26,7 +27,10 @@
)
import mmap
import os
from pathlib import Path
from pathlib import (
Path,
PurePosixPath,
)
import re
import tarfile
from typing import (
@@ -42,6 +46,7 @@
overload,
)
from urllib.parse import (
unquote,
urljoin,
urlparse as parse_url,
uses_netloc,
@@ -55,6 +60,7 @@
BaseBuffer,
ReadCsvBuffer,
)
from pandas.compat import is_platform_windows
from pandas.compat._optional import import_optional_dependency
from pandas.util._decorators import doc
from pandas.util._exceptions import find_stack_level
@@ -1282,3 +1288,199 @@ def dedup_names(
counts[col] = cur_count + 1

return names


def _infer_protocol(path: str) -> str:
"""
Infer the protocol of a given path string.
Parameters
----------
path : str
The path string to infer the protocol from.
Returns
-------
str
The inferred protocol.
"""
# Treat Windows drive letters like C:\ as local file paths
if is_platform_windows() and re.match(r"^[a-zA-Z]:[\\/]", path):
return "file"

if is_fsspec_url(path) or path.startswith("http"):
parsed = parse_url(path)
return parsed.scheme
return "file"


def _match_file(
path: Path | PurePosixPath, extensions: set[str] | None, glob: str | None
) -> bool:
"""
Check if the file matches the given extensions and glob pattern.
Parameters
----------
path : Path or PurePosixPath
The file path to check.
extensions : set[str]
A set of file extensions to match against.
glob : str
A glob pattern to match against.
Returns
-------
bool
True if the file matches the extensions and glob pattern, False otherwise.
"""
return (extensions is None or path.suffix.lower() in extensions) and (
glob is None or path.match(glob)
)
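
A minimal sketch of the matching semantics (hypothetical file names, chosen for illustration):

from pathlib import Path
# Both filters must pass; a None filter always passes.
_match_file(Path("data/a.csv"), {".csv"}, None)      # -> True
_match_file(Path("data/notes.txt"), {".csv"}, None)  # -> False (extension filter)
_match_file(Path("data/a.csv"), {".csv"}, "b.*")     # -> False (glob filter)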


def _resolve_local_path(path_str: str) -> Path:
"""
Resolve a local file path, handling Windows paths and file URLs.
Parameters
----------
path_str : str
The path string to resolve.
Returns
-------
Path
A Path object representing the resolved local path.
"""
parsed = parse_url(path_str)

if is_platform_windows():
if parsed.scheme == "file":
if parsed.netloc:
return Path(f"//{parsed.netloc}{unquote(parsed.path)}")
return Path(unquote(parsed.path.lstrip("/")))

if re.match(r"^[a-zA-Z]:[\\/]", path_str):
return Path(unquote(path_str))

return Path(unquote(parsed.path))
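
Illustrative resolutions under the branches above (a sketch; the exact textual rendering of the resulting Path differs by platform):

_resolve_local_path("file:///home/user/data")    # POSIX   -> Path("/home/user/data")
_resolve_local_path("file:///C:/Temp/data")      # Windows -> Path("C:/Temp/data")
_resolve_local_path("file://server/share/data")  # Windows -> Path("//server/share/data") (UNC)
_resolve_local_path("C:\\Temp\\data")            # Windows -> Path("C:/Temp/data")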


def iterdir(
path: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
extensions: str | Iterable[str] | None = None,
glob: str | None = None,
storage_options: StorageOptions | None = None,
) -> FilePath | list[FilePath] | ReadCsvBuffer[bytes] | ReadCsvBuffer[str]:
"""
Return the file paths in a directory (non-recursive). File-like objects
and plain URLs are returned directly. Remote paths are handled via fsspec.

Supports:
- Local paths (str, os.PathLike)
- file:// URLs
- Remote paths (e.g., s3://) via fsspec (if installed)

Parameters
----------
path : FilePath or file-like object
Path to the directory or file (local or remote), or an open file-like object.
extensions : str or list of str, optional
Only include files with the given extension(s). Case-insensitive.
If None, all files are included.
glob : str, optional
Only include files matching the given glob pattern.
If None, all files are included.
storage_options : StorageOptions, optional
Extra options (e.g. credentials) forwarded to fsspec for remote paths.

Returns
-------
list of str or Path, or BaseBuffer
If `path` is a file-like object or URL, it is returned directly.
Otherwise, a list of the matching file paths in the directory.

Raises
------
TypeError
If `path` is not a string, os.PathLike, or file-like object.
FileNotFoundError
If the specified path does not exist.
ValueError
If the specified path is neither a file nor a directory.
ImportError
If fsspec is required but not installed.
"""

# file-like objects and urls are returned directly
if hasattr(path, "read") or hasattr(path, "write") or is_url(path):
return path

if not isinstance(path, (str, os.PathLike)):
raise TypeError(
f"Expected file path name or file-like object, got {type(path)} type"
)

if extensions is not None:
if isinstance(extensions, str):
extensions = {extensions.lower()}
else:
extensions = {ext.lower() for ext in extensions}

path_str = os.fspath(path)
scheme = _infer_protocol(path_str)

if scheme == "file":
resolved_path = _resolve_local_path(path_str)
if not resolved_path.exists():
raise FileNotFoundError(f"No such file or directory: '{resolved_path}'")

result = []
if resolved_path.is_file():
if _match_file(
resolved_path,
extensions,
glob,
):
result.append(resolved_path)
return result # type: ignore[return-value]

if resolved_path.is_dir():
for entry in resolved_path.iterdir():
if entry.is_file():
if _match_file(
entry,
extensions,
glob,
):
result.append(entry)
return result # type: ignore[return-value]

raise ValueError(
f"The path '{resolved_path}' is neither a file nor a directory."
)

# Remote paths
fsspec = import_optional_dependency("fsspec")

# GH #11071
# Two legacy S3 protocols (s3n and s3a) are replaced with s3
if path_str.startswith("s3n://"):
path_str = path_str.replace("s3n://", "s3://")
if path_str.startswith("s3a://"):
path_str = path_str.replace("s3a://", "s3://")

fs, inner_path = fsspec.core.url_to_fs(path_str, **(storage_options or {}))
if fs.isfile(inner_path):
path_obj = PurePosixPath(inner_path)
if _match_file(
path_obj,
extensions,
glob,
):
return [path]

result = []
for file in fs.ls(inner_path, detail=True):
if file["type"] == "file":
path_obj = PurePosixPath(file["name"])
if _match_file(
path_obj,
extensions,
glob,
):
result.append(f"{scheme}://{path_obj}") # type: ignore[arg-type]
return result # type: ignore[return-value]
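
Putting the helper together, a hedged usage sketch (assuming a local directory data/ containing a.csv, b.csv, and notes.txt; iterdir is the private helper added above, so this is illustration rather than public API):

import io
from pandas.io.common import iterdir

iterdir("data", extensions=".csv")  # -> [Path("data/a.csv"), Path("data/b.csv")] (order not guaranteed)
iterdir("data", glob="a.*")         # -> [Path("data/a.csv")]
iterdir("data/a.csv")               # single file -> [Path("data/a.csv")]

# File-like objects (and plain URLs) pass through unchanged
buf = io.StringIO("x,y\n1,2\n")
iterdir(buf) is buf                 # -> True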
74 changes: 60 additions & 14 deletions pandas/io/parsers/readers.py
@@ -55,6 +55,7 @@
from pandas.io.common import (
IOHandles,
get_handle,
iterdir,
stringify_path,
validate_header_arg,
)
@@ -73,6 +74,7 @@
if TYPE_CHECKING:
from collections.abc import (
Callable,
Generator,
Hashable,
Iterable,
Mapping,
@@ -668,9 +670,41 @@ def _validate_names(names: Sequence[Hashable] | None) -> None:
raise ValueError("Names should be an ordered collection.")


def _multi_file_generator(
list_of_files: list[FilePath], kwds
) -> Generator[DataFrame] | Generator[TextFileReader]:
"""
Generator that yields DataFrames or TextFileReaders for each file in the
provided list of files.
Parameters
----------
list_of_files : list of str or Path
List of file paths to read.
kwds : dict
Keyword arguments to pass to the TextFileReader.
Returns
-------
Generator[DataFrame] | Generator[TextFileReader]
A generator that yields DataFrames or TextFileReaders for each file.
"""

chunksize = kwds.get("chunksize", None)
iterator = kwds.get("iterator", False)
nrows = kwds.get("nrows", None)

for file in list_of_files:
parser = TextFileReader(file, **kwds)

if chunksize or iterator:
yield parser
else:
with parser:
yield parser.read(nrows)


def _read(
filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], kwds
) -> DataFrame | TextFileReader:
) -> DataFrame | TextFileReader | Generator[DataFrame] | Generator[TextFileReader]:
"""Generic reader of line files."""
# if we pass a date_format and parse_dates=False, we should not parse the
# dates GH#44366
@@ -709,14 +743,27 @@ def _read(
# Check for duplicates in names.
_validate_names(kwds.get("names", None))

# Create the parser.
parser = TextFileReader(filepath_or_buffer, **kwds)
extensions = kwds.get("extensions", None)
glob = kwds.get("glob", None)
storage_options = kwds.get("storage_options", None)
files = iterdir(filepath_or_buffer, extensions, glob, storage_options)

if isinstance(files, list) and not files:
raise FileNotFoundError(
f"No files found in {filepath_or_buffer}, "
f"with extension(s) {extensions} and glob pattern {glob}"
)

if chunksize or iterator:
return parser
if (isinstance(files, list) and len(files) == 1) or not isinstance(files, list):
file = files[0] if isinstance(files, list) else files
parser = TextFileReader(file, **kwds)

with parser:
return parser.read(nrows)
if chunksize or iterator:
return parser

with parser:
return parser.read(nrows)
return _multi_file_generator(files, kwds)


@overload
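
With this dispatch in place, read_csv hands back either a single DataFrame/TextFileReader (one file, or a buffer/URL) or a generator with one entry per matched file. A hedged sketch of the calling pattern this WIP enables, assuming the same data/ directory as above (the extensions/glob keywords are inferred from the kwds lookups in _read; the public signature change is not shown in this excerpt):

import pandas as pd

# Directory with several matching files: a generator of DataFrames
frames = pd.read_csv("data")
combined = pd.concat(frames, ignore_index=True)

# With chunksize or iterator set, the generator yields TextFileReader objects
for reader in pd.read_csv("data", chunksize=1_000):
    for chunk in reader:
        print(chunk.shape)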
@@ -832,7 +879,7 @@ def read_csv(
float_precision: Literal["high", "legacy", "round_trip"] | None = None,
storage_options: StorageOptions | None = None,
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame | TextFileReader:
) -> DataFrame | TextFileReader | Generator[DataFrame] | Generator[TextFileReader]:
# locals() should never be modified
kwds = locals().copy()
del kwds["filepath_or_buffer"]
@@ -932,10 +979,9 @@ def read_table(
skipfooter: int = 0,
nrows: int | None = None,
# NA and Missing Data Handling
na_values: Hashable
| Iterable[Hashable]
| Mapping[Hashable, Iterable[Hashable]]
| None = None,
na_values: (
Hashable | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] | None
) = None,
keep_default_na: bool = True,
na_filter: bool = True,
skip_blank_lines: bool = True,
@@ -968,7 +1014,7 @@ def read_table(
float_precision: Literal["high", "legacy", "round_trip"] | None = None,
storage_options: StorageOptions | None = None,
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame | TextFileReader:
) -> DataFrame | TextFileReader | Generator[DataFrame] | Generator[TextFileReader]:
# locals() should never be modified
kwds = locals().copy()
del kwds["filepath_or_buffer"]
@@ -1038,7 +1084,7 @@ def read_fwf(
iterator: bool = False,
chunksize: int | None = None,
**kwds: Unpack[_read_shared[HashableT]],
) -> DataFrame | TextFileReader:
) -> DataFrame | TextFileReader | Generator[DataFrame] | Generator[TextFileReader]:
r"""
Read a table of fixed-width formatted lines into DataFrame.
