Skip to content

Commit

Permalink
FIX: Fix lgtm alerts (#22)
Browse files (browse the repository at this point in the history)
* Fix lgtm alerts
* Backwards compatibility pandas
  • Loading branch information
sbrugman authored Sep 23, 2020
1 parent d7ff488 commit 2fb1a5f
Show file tree
Hide file tree
Showing 6 changed files with 38 additions and 35 deletions.
3 changes: 2 additions & 1 deletion src/compressio/compression_algorithms/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
compress_float,
compress_integer,
compress_object,
compress_sparse,
compress_sparse_missing,
)

__all__ = [
Expand All @@ -15,4 +15,5 @@
"compress_float",
"compress_integer",
"compress_object",
"compress_sparse_missing",
]
24 changes: 16 additions & 8 deletions src/compressio/compression_algorithms/type_compressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

import numpy as np
import pandas as pd
from pandas import CategoricalDtype

nan_value = pd.NA if hasattr(pd, "NA") else np.nan


def type_tester(
Expand All @@ -22,7 +23,7 @@ def get_compressed_type(
return next(test_sequence)


def compress_sparse(series: pd.Series) -> pd.Series:
def compress_sparse_missing(series: pd.Series) -> pd.Series:
"""Compresses the data by using the SparseArray data structure for missing values/nans
:param series: series to compress
Expand All @@ -37,19 +38,26 @@ def compress_sparse(series: pd.Series) -> pd.Series:

# pandas dtypes
if pd.api.types.is_extension_array_dtype(test_dtype):
if test_dtype != CategoricalDtype():
test_dtype = test_dtype.numpy_dtype
fill_value = pd.NA
if test_dtype != pd.CategoricalDtype():
if hasattr(test_dtype, "numpy_dtype"):
test_dtype = test_dtype.numpy_dtype
elif hasattr(test_dtype, "type"):
test_dtype = test_dtype.type
else:
raise ValueError(f"Couldn't obtain the dtype of {type(test_dtype)}")
fill_value = nan_value
else:
test_dtype = np.object

new_series = pd.Series(
pd.arrays.SparseArray(series, dtype=test_dtype, fill_value=fill_value)
pd.arrays.SparseArray(
series[series.notnull()], dtype=test_dtype, fill_value=fill_value
)
)
if new_series.memory_usage() < series.memory_usage():
return new_series
else:
return series

return series


def compress_float(series: pd.Series) -> pd.Series:
Expand Down
14 changes: 7 additions & 7 deletions src/compressio/type_compressor.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,14 @@
compress_float,
compress_integer,
compress_object,
compress_sparse,
compress_sparse_missing,
)
from compressio.utils import compose


@singledispatch
def parse_func(f):
raise NotImplemented
raise NotImplementedError


@parse_func.register(object)
Expand Down Expand Up @@ -65,13 +65,13 @@ def __init__(self, *args, **kwargs):
class SparseCompressor(BaseTypeCompressor):
def __init__(self, *args, **kwargs):
compression_map = {
Integer: [compress_sparse, compress_integer],
Float: [compress_sparse, compress_float],
Complex: [compress_sparse, compress_complex],
Integer: [compress_sparse_missing, compress_integer],
Float: [compress_sparse_missing, compress_float],
Complex: [compress_sparse_missing, compress_complex],
Object: compress_object,
Boolean: compress_sparse,
Boolean: compress_sparse_missing,
# Pending https://github.com/pandas-dev/pandas/issues/35762
DateTime: compress_datetime,
String: [compress_sparse, compress_object],
String: [compress_sparse_missing, compress_object],
}
super().__init__(compression_map, *args, **kwargs)
2 changes: 1 addition & 1 deletion tests/test_compression_algorithms.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import numpy as np
import pandas as pd
import pytest
from pandas._testing import assert_series_equal
from pandas.testing import assert_series_equal
from visions import StandardSet

from compressio.compression_algorithms import (
Expand Down
13 changes: 3 additions & 10 deletions tests/test_compression_func_default.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,13 @@
import pandas as pd
import pytest
import visions
from pandas._testing import assert_series_equal
from pandas.testing import assert_series_equal
from visions import StandardSet

from compressio.compress import compress_func
from compressio.type_compressor import DefaultCompressor

if hasattr(visions, "BoolDtype"):
from visions import BoolDtype

bool_type = BoolDtype
elif hasattr(pd, "BooleanDtype"):
bool_type = pd.BooleanDtype
else:
raise RuntimeError("No boolean Dtype found. Please update visions/pandas")
bool_dtype = "boolean" if int(pd.__version__.split(".")[0]) >= 1 else "Bool"


@pytest.mark.parametrize(
Expand All @@ -30,7 +23,7 @@
(
pd.Series([True, False, None, None, None, None, True, False] * 1000),
np.object,
bool_type,
bool_dtype,
),
],
)
Expand Down
17 changes: 9 additions & 8 deletions tests/test_compression_func_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
from compressio.compress import compress_func
from compressio.type_compressor import SparseCompressor

nan_value = pd.NA if hasattr(pd, "NA") else np.nan
bool_dtype = "boolean" if int(pd.__version__.split(".")[0]) >= 1 else "Bool"


@pytest.mark.parametrize(
"series,before,expected,inference",
Expand All @@ -27,7 +30,7 @@
dtype="Int64",
),
pd.Int64Dtype(),
pd.SparseDtype(np.int8, pd.NA),
pd.SparseDtype(np.int8, nan_value),
True,
),
(
Expand All @@ -41,7 +44,7 @@
),
(
pd.Series(
[pd.NA] * 100 + [0] * 9000 + [1] * 500 + [2] * 400 + [3] * 10,
[nan_value] * 100 + [0] * 9000 + [1] * 500 + [2] * 400 + [3] * 10,
dtype="Int64",
),
pd.Int64Dtype(),
Expand All @@ -68,17 +71,17 @@
),
(
pd.Series(
[pd.NA] * 10000 + random.choices(["gold", "black", "silver"], k=10),
[nan_value] * 10000 + random.choices(["gold", "black", "silver"], k=10),
dtype=str,
),
np.object,
pd.SparseDtype("object", np.nan),
False,
),
(
pd.Series([pd.NA] * 10000 + [True, False], dtype="boolean"),
pd.BooleanDtype(),
pd.SparseDtype(bool, pd.NA),
pd.Series([nan_value] * 10000 + [True, False], dtype=bool_dtype),
bool_dtype,
pd.SparseDtype(bool, nan_value),
False,
),
],
Expand All @@ -92,5 +95,3 @@ def test_compress_series(series, before, expected, inference):
with_inference=inference,
)
assert compressed_series.dtype == expected

# TODO: assert values

0 comments on commit 2fb1a5f

Please sign in to comment.