Skip to content

Commit

Permalink
Handle on the fly decompression of data files supported by pandas (yd…
Browse files Browse the repository at this point in the history
…ataai#657)

* Handle on the fly decompression of data files supported by pandas

Pandas can decompress data files on the fly when the file name ends with one of
the following extensions: ‘.bz2’, ‘.gz’, ‘.zip’, or ‘.xz’ (files with any other
extension are read without decompression). Now the profiler can as well.

Co-authored-by: Davin Shearer <scholarsmate@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
3 people authored Jan 5, 2021
1 parent 1e086f4 commit 64ad4cc
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 6 deletions.
7 changes: 3 additions & 4 deletions requirements-test.txt
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
pytest
coverage<5
codecov
pytest-mypy>=0.7.0
pytest-mypy
pytest-cov
nbval>=0.9.6
ipykernel>=5.3.4
nbval
fastparquet==0.4.1
flake8
check-manifest>=0.45
check-manifest>=0.41
twine>=3.1.1
kaggle
55 changes: 54 additions & 1 deletion src/pandas_profiling/utils/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,55 @@ def warn_read(extension):
)


def is_supported_compression(file_extension: str) -> bool:
    """Tell whether a file extension names a compression format pandas decompresses on the fly.

    Args:
        file_extension (str): the extension to check, including its leading dot (for example ".gz");
            the check is case-insensitive

    Returns:
        bool: True for ".bz2", ".gz", ".xz" and ".zip", False for any other value

    Notes:
        For '.zip', the archive must contain exactly one data file to be read in.
    """
    normalized = file_extension.lower()
    return normalized in (".bz2", ".gz", ".xz", ".zip")


def remove_suffix(text: str, suffix: str) -> str:
    """Strip a trailing suffix from a string, if present.

    Args:
        text (str): the string to trim
        suffix (str): the ending to strip off

    Returns:
        str: ``text`` without the trailing ``suffix`` when ``text`` ends with it; otherwise ``text`` unchanged
            (an empty ``suffix`` always leaves ``text`` unchanged)

    Notes:
        Python 3.9+ offers the built-in ``str.removesuffix()`` for the same purpose.
    """
    # Guard first: an empty suffix must not reach the slice below, and a
    # non-matching suffix means there is nothing to strip.
    if not suffix or not text.endswith(suffix):
        return text
    keep = len(text) - len(suffix)
    return text[:keep]


def uncompressed_extension(file_name: Path) -> str:
    """Report the extension of the data file, looking past a compression suffix pandas handles.

    Args:
        file_name (Path): the file name to inspect

    Returns:
        str: the lower-cased extension that precedes the compression suffix when the final extension is one
            pandas decompresses automatically; otherwise the lower-cased final extension itself
    """
    outer = file_name.suffix.lower()
    if not is_supported_compression(outer):
        return outer

    # Drop the compression suffix from the lower-cased path; the suffix that
    # remains (if any) is the extension of the underlying data file.
    without_compression = remove_suffix(str(file_name).lower(), outer)
    return Path(without_compression).suffix


def read_pandas(file_name: Path) -> pd.DataFrame:
"""Read DataFrame based on the file extension. This function is used when the file is in a standard format.
Various file types are supported (.csv, .json, .jsonl, .data, .tsv, .xls, .xlsx, .xpt, .sas7bdat, .parquet)
Expand All @@ -40,7 +89,7 @@ def read_pandas(file_name: Path) -> pd.DataFrame:
user input, which is currently used in the editor integration. For more advanced use cases, the user should load
the DataFrame in code.
"""
extension = file_name.suffix.lower()
extension = uncompressed_extension(file_name)
if extension == ".json":
df = pd.read_json(str(file_name))
elif extension == ".jsonl":
Expand All @@ -59,6 +108,10 @@ def read_pandas(file_name: Path) -> pd.DataFrame:
df = pd.read_parquet(str(file_name))
elif extension in [".pkl", ".pickle"]:
df = pd.read_pickle(str(file_name))
elif extension == ".tar":
raise ValueError(
"tar compression is not supported directly by pandas, please use the 'tarfile' module"
)
else:
if extension != ".csv":
warn_read(extension)
Expand Down
17 changes: 16 additions & 1 deletion tests/unit/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,12 @@
import pandas as pd
import pytest

from pandas_profiling.utils.dataframe import expand_mixed, read_pandas, warn_read
from pandas_profiling.utils.dataframe import (
expand_mixed,
read_pandas,
uncompressed_extension,
warn_read,
)


def test_read_pandas_parquet():
Expand Down Expand Up @@ -47,3 +52,13 @@ def test_expand():
df = pd.DataFrame(data=[{"name": "John", "age": 30}, {"name": "Alice", "age": 25}])
expanded_df = expand_mixed(df)
assert expanded_df.shape == (2, 2)


def test_remove_compression_ext():
    """A supported compression suffix is skipped and the inner data extension is returned."""
    expectations = [
        ("dataset.csv.gz", ".csv"),
        ("dataset.tsv.xz", ".tsv"),
    ]
    for name, expected in expectations:
        assert uncompressed_extension(Path(name)) == expected


def test_remove_unsupported_ext():
    """Reading a gzip-compressed tar archive through read_pandas must raise ValueError."""
    tarball = Path("dataset.json.tar.gz")
    with pytest.raises(ValueError):
        read_pandas(tarball)

0 comments on commit 64ad4cc

Please sign in to comment.