Skip to content

Commit

Permalink
Fixes (#25)
Browse files Browse the repository at this point in the history
- version
- pyupgrade
- deep=True by default
- fix test
- Add size test
  • Loading branch information
sbrugman authored Oct 13, 2020
1 parent d9aefa9 commit d6795a8
Show file tree
Hide file tree
Showing 9 changed files with 43 additions and 28 deletions.
7 changes: 6 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,9 @@ repos:
- repo: https://github.com/timothycrosley/isort
rev: 5.4.2
hooks:
- id: isort
- id: isort
- repo: https://github.com/asottile/pyupgrade
rev: v2.7.2
hooks:
- id: pyupgrade
args: [--py36-plus]
6 changes: 5 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,14 @@
with (source_root / "requirements_test.txt").open(encoding="utf8") as f:
test_requirements = f.readlines()

__version__ = None
with (source_root / "src/compressio/version.py").open(encoding="utf8") as f:
exec(f.read())


setup(
name="compressio",
version="0.1.3",
version=__version__,
url="https://github.com/dylan-profiler/compressio",
description="compressio",
author="Ian Eaves, Simon Brugman",
Expand Down
3 changes: 3 additions & 0 deletions src/compressio/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
SparseCompressor,
)

from .version import __version__

__all__ = [
"Compress",
"storage_size",
Expand All @@ -22,4 +24,5 @@
"DefaultCompressor",
"SparseCompressor",
"type_compressions",
"__version__",
]
6 changes: 3 additions & 3 deletions src/compressio/compression_algorithms/type_compressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def compress_sparse_missing(series: pd.Series) -> pd.Series:
series[series.notnull()], dtype=test_dtype, fill_value=fill_value
)
)
if new_series.memory_usage() < series.memory_usage():
if new_series.memory_usage(deep=True) < series.memory_usage(deep=True):
return new_series

return series
Expand Down Expand Up @@ -127,7 +127,7 @@ def compress_complex(series: pd.Series) -> pd.Series:
def compress_object(series: pd.Series) -> pd.Series:
try:
new_series = series.astype("category")
if new_series.memory_usage() < series.memory_usage():
if new_series.memory_usage(deep=True) < series.memory_usage(deep=True):
return new_series
except: # noqa
pass
Expand All @@ -137,7 +137,7 @@ def compress_object(series: pd.Series) -> pd.Series:
def compress_datetime(series: pd.Series) -> pd.Series:
try:
new_series = series.astype("category")
if new_series.memory_usage() < series.memory_usage():
if new_series.memory_usage(deep=True) < series.memory_usage(deep=True):
return new_series
except: # noqa
pass
Expand Down
27 changes: 11 additions & 16 deletions src/compressio/diagnostics.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
"""Work in progress"""
from functools import singledispatch

import pandas as pd
Expand All @@ -11,18 +10,18 @@


@singledispatch
def storage_size(data: pdT, deep=True) -> Quantity:
def storage_size(data: pdT) -> Quantity:
raise TypeError(f"Can't compute memory size of objects with type {type(data)}")


@storage_size.register(pd.Series) # type: ignore
def _(data: pd.Series, deep) -> Quantity:
return Quantity(value=data.memory_usage(deep=deep), units="byte")
def _(data: pd.Series) -> Quantity:
return Quantity(value=data.memory_usage(deep=True), units="byte")


@storage_size.register(pd.DataFrame) # type: ignore
def _(data: pd.DataFrame, deep=False) -> Quantity:
return Quantity(value=data.memory_usage(deep=deep).sum(), units="byte")
def _(data: pd.DataFrame) -> Quantity:
return Quantity(value=data.memory_usage(deep=True).sum(), units="byte")


@singledispatch
Expand Down Expand Up @@ -65,19 +64,15 @@ def _(
compress_report(data[column], typeset, compressor, with_inference, units)


def savings(
original_data: pdT, new_data: pdT, units="megabyte", deep=False
) -> Quantity:
original_size = storage_size(original_data, deep)
new_size = storage_size(new_data, deep)
def savings(original_data: pdT, new_data: pdT, units: str = "megabyte",) -> Quantity:
original_size = storage_size(original_data)
new_size = storage_size(new_data)
return (original_size - new_size).to(units)


def savings_report(
original_data: pdT, new_data: pdT, units="megabyte", deep=False
) -> None:
original_size = storage_size(original_data, deep).to(units)
new_size = storage_size(new_data, deep).to(units)
def savings_report(original_data: pdT, new_data: pdT, units: str = "megabyte",) -> None:
original_size = storage_size(original_data).to(units)
new_size = storage_size(new_data).to(units)
reduction = original_size - new_size
print(f"Original size: {original_size}")
print(f"Compressed size: {new_size}")
Expand Down
1 change: 1 addition & 0 deletions src/compressio/version.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Package version string. setup.py exec()s this file to obtain it, and
# compressio/__init__.py re-exports it as compressio.__version__.
__version__ = "0.1.4"
1 change: 0 additions & 1 deletion tests/test_compression_algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import pandas as pd
import pytest
from pandas.testing import assert_series_equal
from visions import StandardSet

from compressio.compression_algorithms import (
compress_complex,
Expand Down
10 changes: 4 additions & 6 deletions tests/test_compression_func_default.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import numpy as np
import pandas as pd
import pytest
import visions
from pandas.testing import assert_series_equal
from visions import StandardSet

Expand All @@ -17,9 +16,9 @@
(
pd.Series([10.0, 100.0, np.iinfo(np.int16).max * 1.0], dtype=np.float64),
np.float64,
np.int16,
"int16",
),
(pd.Series([np.nan, 1], dtype=np.float64), np.float64, pd.Int8Dtype),
(pd.Series([np.nan, 1], dtype=np.float64), np.float64, "Int8"),
(
pd.Series([True, False, None, None, None, None, True, False] * 1000),
np.object,
Expand All @@ -35,8 +34,7 @@ def test_compress_series(series, before, expected):
compressor=DefaultCompressor(),
with_inference=True,
)
assert compressed_series.dtype == expected or isinstance(
compressed_series.dtype, expected
)

assert str(compressed_series.dtype) == expected

assert_series_equal(series, compressed_series, check_dtype=False)
10 changes: 10 additions & 0 deletions tests/test_size.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import pandas as pd
from compressio import storage_size, savings


def test_size_formatting():
    """Check how ``storage_size`` and ``savings`` results render as strings.

    Two boolean series are built (20000 and 10000 elements). The test pins
    the deep memory usage pandas reports for the larger one, the string form
    of its ``storage_size``, and the string form of the ``savings`` between
    the larger and the smaller series.
    """
    pattern = [True, False]
    larger = pd.Series(pattern * 10000, dtype=bool)
    smaller = pd.Series(pattern * 5000, dtype=bool)

    # Raw figure reported by pandas; the storage_size string below is
    # expected to show the same number.
    deep_bytes = larger.memory_usage(deep=True)
    assert deep_bytes == 20128

    size_text = str(storage_size(larger))
    assert size_text == "20128 byte"

    # No units argument is passed, so savings uses its default unit.
    saved_text = str(savings(larger, smaller))
    assert saved_text == "0.01 megabyte"

0 comments on commit d6795a8

Please sign in to comment.