diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6b012e3..0abe215 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,4 +11,9 @@ repos: - repo: https://github.com/timothycrosley/isort rev: 5.4.2 hooks: - - id: isort \ No newline at end of file + - id: isort +- repo: https://github.com/asottile/pyupgrade + rev: v2.7.2 + hooks: + - id: pyupgrade + args: [--py36-plus] \ No newline at end of file diff --git a/setup.py b/setup.py index 7b175fa..5c570cc 100644 --- a/setup.py +++ b/setup.py @@ -14,10 +14,14 @@ with (source_root / "requirements_test.txt").open(encoding="utf8") as f: test_requirements = f.readlines() +__version__ = None +with (source_root / "src/compressio/version.py").open(encoding="utf8") as f: + exec(f.read()) + setup( name="compressio", - version="0.1.3", + version=__version__, url="https://github.com/dylan-profiler/compressio", description="compressio", author="Ian Eaves, Simon Brugman", diff --git a/src/compressio/__init__.py b/src/compressio/__init__.py index e01e64e..4cb6a09 100644 --- a/src/compressio/__init__.py +++ b/src/compressio/__init__.py @@ -12,6 +12,8 @@ SparseCompressor, ) +from .version import __version__ + __all__ = [ "Compress", "storage_size", @@ -22,4 +24,5 @@ "DefaultCompressor", "SparseCompressor", "type_compressions", + "__version__", ] diff --git a/src/compressio/compression_algorithms/type_compressions.py b/src/compressio/compression_algorithms/type_compressions.py index 80386b5..36906e8 100644 --- a/src/compressio/compression_algorithms/type_compressions.py +++ b/src/compressio/compression_algorithms/type_compressions.py @@ -54,7 +54,7 @@ def compress_sparse_missing(series: pd.Series) -> pd.Series: series[series.notnull()], dtype=test_dtype, fill_value=fill_value ) ) - if new_series.memory_usage() < series.memory_usage(): + if new_series.memory_usage(deep=True) < series.memory_usage(deep=True): return new_series return series @@ -127,7 +127,7 @@ def compress_complex(series: pd.Series) -> pd.Series: def compress_object(series: pd.Series) -> pd.Series: try: new_series = series.astype("category") - if new_series.memory_usage() < series.memory_usage(): + if new_series.memory_usage(deep=True) < series.memory_usage(deep=True): return new_series except: # noqa pass @@ -137,7 +137,7 @@ def compress_object(series: pd.Series) -> pd.Series: def compress_datetime(series: pd.Series) -> pd.Series: try: new_series = series.astype("category") - if new_series.memory_usage() < series.memory_usage(): + if new_series.memory_usage(deep=True) < series.memory_usage(deep=True): return new_series except: # noqa pass diff --git a/src/compressio/diagnostics.py b/src/compressio/diagnostics.py index c1bee50..9b4458d 100644 --- a/src/compressio/diagnostics.py +++ b/src/compressio/diagnostics.py @@ -1,4 +1,3 @@ -"""Work in progress""" from functools import singledispatch import pandas as pd @@ -11,18 +10,18 @@ @singledispatch -def storage_size(data: pdT, deep=True) -> Quantity: +def storage_size(data: pdT) -> Quantity: raise TypeError(f"Can't compute memory size of objects with type {type(data)}") @storage_size.register(pd.Series) # type: ignore -def _(data: pd.Series, deep) -> Quantity: - return Quantity(value=data.memory_usage(deep=deep), units="byte") +def _(data: pd.Series) -> Quantity: + return Quantity(value=data.memory_usage(deep=True), units="byte") @storage_size.register(pd.DataFrame) # type: ignore -def _(data: pd.DataFrame, deep=False) -> Quantity: - return Quantity(value=data.memory_usage(deep=deep).sum(), units="byte") +def _(data: pd.DataFrame) -> Quantity: + return Quantity(value=data.memory_usage(deep=True).sum(), units="byte") @singledispatch @@ -65,19 +64,15 @@ def _( compress_report(data[column], typeset, compressor, with_inference, units) -def savings( - original_data: pdT, new_data: pdT, units="megabyte", deep=False -) -> Quantity: - original_size = storage_size(original_data, deep) - new_size = storage_size(new_data, deep) +def savings(original_data: pdT, new_data: pdT, units: str = "megabyte",) -> Quantity: + original_size = storage_size(original_data) + new_size = storage_size(new_data) return (original_size - new_size).to(units) -def savings_report( - original_data: pdT, new_data: pdT, units="megabyte", deep=False -) -> None: - original_size = storage_size(original_data, deep).to(units) - new_size = storage_size(new_data, deep).to(units) +def savings_report(original_data: pdT, new_data: pdT, units: str = "megabyte",) -> None: + original_size = storage_size(original_data).to(units) + new_size = storage_size(new_data).to(units) reduction = original_size - new_size print(f"Original size: {original_size}") print(f"Compressed size: {new_size}") diff --git a/src/compressio/version.py b/src/compressio/version.py new file mode 100644 index 0000000..bbab024 --- /dev/null +++ b/src/compressio/version.py @@ -0,0 +1 @@ +__version__ = "0.1.4" diff --git a/tests/test_compression_algorithms.py b/tests/test_compression_algorithms.py index 7821fac..cee0de3 100644 --- a/tests/test_compression_algorithms.py +++ b/tests/test_compression_algorithms.py @@ -2,7 +2,6 @@ import pandas as pd import pytest from pandas.testing import assert_series_equal -from visions import StandardSet from compressio.compression_algorithms import ( compress_complex, diff --git a/tests/test_compression_func_default.py b/tests/test_compression_func_default.py index d2562cd..bf812cc 100644 --- a/tests/test_compression_func_default.py +++ b/tests/test_compression_func_default.py @@ -1,7 +1,6 @@ import numpy as np import pandas as pd import pytest -import visions from pandas.testing import assert_series_equal from visions import StandardSet @@ -17,9 +16,9 @@ ( pd.Series([10.0, 100.0, np.iinfo(np.int16).max * 1.0], dtype=np.float64), np.float64, - np.int16, + "int16", ), - (pd.Series([np.nan, 1], dtype=np.float64), np.float64, pd.Int8Dtype), + (pd.Series([np.nan, 1], dtype=np.float64), np.float64, "Int8"), ( pd.Series([True, False, None, None, None, None, True, False] * 1000), np.object, @@ -35,8 +34,7 @@ def test_compress_series(series, before, expected): compressor=DefaultCompressor(), with_inference=True, ) - assert compressed_series.dtype == expected or isinstance( - compressed_series.dtype, expected - ) + + assert str(compressed_series.dtype) == expected assert_series_equal(series, compressed_series, check_dtype=False) diff --git a/tests/test_size.py b/tests/test_size.py new file mode 100644 index 0000000..7f8d9d0 --- /dev/null +++ b/tests/test_size.py @@ -0,0 +1,10 @@ +import pandas as pd +from compressio import storage_size, savings + + +def test_size_formatting(): + series1 = pd.Series([True, False] * 10000, dtype=bool) + series2 = pd.Series([True, False] * 5000, dtype=bool) + assert series1.memory_usage(deep=True) == 20128 + assert str(storage_size(series1)) == "20128 byte" + assert str(savings(series1, series2)) == "0.01 megabyte"