Skip to content

Commit

Permalink
ENH: Infer compression from non-string paths (pandas-dev#17206)
Browse files Browse the repository at this point in the history
  • Loading branch information
dhimmel authored and jowens committed Sep 20, 2017
1 parent 536b761 commit a1ff671
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 23 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ Other Enhancements
- :func:`read_html` handles colspan and rowspan arguments and attempts to infer a header if the header is not explicitly specified (:issue:`17054`)
- Integration with `Apache Parquet <https://parquet.apache.org/>`__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here <io.parquet>`.
- :func:`DataFrame.add_prefix` and :func:`DataFrame.add_suffix` now accept strings containing the '%' character. (:issue:`17151`)
- ``read_*`` functions can now infer compression from non-string paths, such as ``pathlib.Path`` objects (:issue:`17206`).

.. _whatsnew_0210.api_breaking:

Expand Down
14 changes: 8 additions & 6 deletions pandas/io/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,13 +272,15 @@ def _infer_compression(filepath_or_buffer, compression):
if compression is None:
return None

# Cannot infer compression of a buffer. Hence assume no compression.
is_path = isinstance(filepath_or_buffer, compat.string_types)
if compression == 'infer' and not is_path:
return None

# Infer compression from the filename/URL extension
# Infer compression
if compression == 'infer':
# Convert all path types (e.g. pathlib.Path) to strings
filepath_or_buffer = _stringify_path(filepath_or_buffer)
if not isinstance(filepath_or_buffer, compat.string_types):
# Cannot infer compression of a buffer, assume no compression
return None

# Infer compression from the filename/URL extension
for compression, extension in _compression_to_extension.items():
if filepath_or_buffer.endswith(extension):
return compression
Expand Down
10 changes: 5 additions & 5 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,11 +208,11 @@
<http://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
for more information on ``iterator`` and ``chunksize``.
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
For on-the-fly decompression of on-disk data. If 'infer', then use gzip,
bz2, zip or xz if filepath_or_buffer is a string ending in '.gz', '.bz2',
'.zip', or 'xz', respectively, and no decompression otherwise. If using
'zip', the ZIP file must contain only one data file to be read in.
Set to None for no decompression.
For on-the-fly decompression of on-disk data. If 'infer' and
`filepath_or_buffer` is path-like, then detect compression from the
following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
decompression). If using 'zip', the ZIP file must contain only one data
file to be read in. Set to None for no decompression.
.. versionadded:: 0.18.1 support for 'zip' and 'xz' compression.
Expand Down
4 changes: 2 additions & 2 deletions pandas/io/pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,8 @@ def read_pickle(path, compression='infer'):
File path
compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, default 'infer'
For on-the-fly decompression of on-disk data. If 'infer', then use
gzip, bz2, xz or zip if path is a string ending in '.gz', '.bz2', 'xz',
or 'zip' respectively, and no decompression otherwise.
gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
or '.zip' respectively, and no decompression otherwise.
Set to None for no decompression.
.. versionadded:: 0.20.0
Expand Down
38 changes: 28 additions & 10 deletions pandas/tests/io/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,6 @@

from pandas import read_csv, concat

try:
from pathlib import Path
except ImportError:
pass

try:
from py.path import local as LocalPath
except ImportError:
pass


class CustomFSPath(object):
"""For testing fspath on unknown objects"""
Expand All @@ -34,6 +24,21 @@ def __fspath__(self):
return self.path


# Callables that accept a string path and hand back either a plain string or
# a path-like object wrapping it. The optional path libraries are registered
# only when they can be imported in the current environment.
path_types = [str, CustomFSPath]

try:
    from pathlib import Path
except ImportError:
    pass
else:
    path_types.append(Path)

try:
    from py.path import local as LocalPath
except ImportError:
    pass
else:
    path_types.append(LocalPath)

HERE = os.path.dirname(__file__)


Expand Down Expand Up @@ -83,6 +88,19 @@ def test_stringify_path_fspath(self):
result = common._stringify_path(p)
assert result == 'foo/bar.csv'

@pytest.mark.parametrize('extension,expected', [
    ('', None),
    ('.gz', 'gzip'),
    ('.bz2', 'bz2'),
    ('.zip', 'zip'),
    ('.xz', 'xz'),
])
@pytest.mark.parametrize('path_type', path_types)
def test_infer_compression_from_path(self, extension, expected, path_type):
    """Check that 'infer' maps each file extension to its compression.

    Runs once per (path type, extension) pair: the path is built by
    passing ``'foo/bar.csv' + extension`` to ``path_type``, and the value
    returned by ``common._infer_compression`` must equal ``expected``.
    """
    target = path_type('foo/bar.csv' + extension)
    inferred = common._infer_compression(target, compression='infer')
    assert inferred == expected

def test_get_filepath_or_buffer_with_path(self):
filename = '~/sometest'
filepath_or_buffer, _, _ = common.get_filepath_or_buffer(filename)
Expand Down

0 comments on commit a1ff671

Please sign in to comment.