Handle on-the-fly decompression of data files supported by pandas (ydataai#657)

* Handle on-the-fly decompression of data files supported by pandas. Pandas can decompress files on the fly when they have one of the following extensions: '.bz2', '.gz', '.zip', or '.xz' (otherwise no decompression is applied). Now the profiler can as well. Co-authored-by: Davin Shearer <scholarsmate@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
akshay-sarbhukan-aera · Jan 5, 2021 · 64ad4cc · 64ad4cc
1 parent 1e086f4
commit 64ad4cc
Show file tree

Hide file tree

Showing 3 changed files with 73 additions and 6 deletions.
diff --git a/requirements-test.txt b/requirements-test.txt
@@ -1,12 +1,11 @@
 pytest
 coverage<5
 codecov
-pytest-mypy>=0.7.0
+pytest-mypy
 pytest-cov
-nbval>=0.9.6
-ipykernel>=5.3.4
+nbval
 fastparquet==0.4.1
 flake8
-check-manifest>=0.45
+check-manifest>=0.41
 twine>=3.1.1
 kaggle
diff --git a/src/pandas_profiling/utils/dataframe.py b/src/pandas_profiling/utils/dataframe.py
@@ -21,6 +21,55 @@ def warn_read(extension):
     )
 
 
+def is_supported_compression(file_extension: str) -> bool:
+    """Determine if the given file extension indicates a compression format that pandas can handle automatically.
+
+    Args:
+        file_extension (str): the file extension to test
+
+    Returns:
+        bool: True if the extension indicates a compression format that pandas handles automatically and False otherwise
+
+    Notes:
+        Pandas can handle on the fly decompression from the following extensions: ‘.bz2’, ‘.gz’, ‘.zip’, or ‘.xz’
+        (otherwise no decompression). If using ‘.zip’, the ZIP file must contain exactly one data file to be read in.
+    """
+    return file_extension.lower() in [".bz2", ".gz", ".xz", ".zip"]
+
+
+def remove_suffix(text: str, suffix: str) -> str:
+    """Removes the given suffix from the given string.
+
+    Args:
+        text (str): the string to remove the suffix from
+        suffix (str): the suffix to remove from the string
+
+    Returns:
+        str: the string with the suffix removed, if the string ends with the suffix, otherwise the unmodified string
+
+    Notes:
+        In python 3.9+, there is a built-in string method called removesuffix() that can serve this purpose.
+    """
+    return text[: -len(suffix)] if suffix and text.endswith(suffix) else text
+
+
+def uncompressed_extension(file_name: Path) -> str:
+    """Returns the uncompressed extension of the given file name.
+
+    Args:
+        file_name (Path): the file name to get the uncompressed extension of
+
+    Returns:
+        str: the uncompressed extension, or the original extension if pandas doesn't handle it automatically
+    """
+    extension = file_name.suffix.lower()
+    return (
+        Path(remove_suffix(str(file_name).lower(), extension)).suffix
+        if is_supported_compression(extension)
+        else extension
+    )
+
+
 def read_pandas(file_name: Path) -> pd.DataFrame:
     """Read DataFrame based on the file extension. This function is used when the file is in a standard format.
     Various file types are supported (.csv, .json, .jsonl, .data, .tsv, .xls, .xlsx, .xpt, .sas7bdat, .parquet)
@@ -40,7 +89,7 @@ def read_pandas(file_name: Path) -> pd.DataFrame:
         user input, which is currently used in the editor integration. For more advanced use cases, the user should load
         the DataFrame in code.
     """
-    extension = file_name.suffix.lower()
+    extension = uncompressed_extension(file_name)
     if extension == ".json":
         df = pd.read_json(str(file_name))
     elif extension == ".jsonl":
@@ -59,6 +108,10 @@ def read_pandas(file_name: Path) -> pd.DataFrame:
         df = pd.read_parquet(str(file_name))
     elif extension in [".pkl", ".pickle"]:
         df = pd.read_pickle(str(file_name))
+    elif extension == ".tar":
+        raise ValueError(
+            "tar compression is not supported directly by pandas, please use the 'tarfile' module"
+        )
     else:
         if extension != ".csv":
             warn_read(extension)

diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py
@@ -3,7 +3,12 @@
 import pandas as pd
 import pytest
 
-from pandas_profiling.utils.dataframe import expand_mixed, read_pandas, warn_read
+from pandas_profiling.utils.dataframe import (
+    expand_mixed,
+    read_pandas,
+    uncompressed_extension,
+    warn_read,
+)
 
 
 def test_read_pandas_parquet():
@@ -47,3 +52,13 @@ def test_expand():
     df = pd.DataFrame(data=[{"name": "John", "age": 30}, {"name": "Alice", "age": 25}])
     expanded_df = expand_mixed(df)
     assert expanded_df.shape == (2, 2)
+
+
+def test_remove_compression_ext():
+    assert uncompressed_extension(Path("dataset.csv.gz")) == ".csv"
+    assert uncompressed_extension(Path("dataset.tsv.xz")) == ".tsv"
+
+
+def test_remove_unsupported_ext():
+    with pytest.raises(ValueError):
+        read_pandas(Path("dataset.json.tar.gz"))