Skip to content

Commit

Permalink
ENH: add groupby & reduce support to EA (pandas-dev#22762)
Browse files Browse the repository at this point in the history
  • Loading branch information
jreback authored and tm9k1 committed Nov 19, 2018
1 parent 19ca934 commit b843388
Show file tree
Hide file tree
Showing 18 changed files with 269 additions and 31 deletions.
10 changes: 9 additions & 1 deletion doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ Pandas has gained the ability to hold integer dtypes with missing values. This l
Here is an example of the usage.

We can construct a ``Series`` with the specified dtype. The dtype string ``Int64`` is a pandas ``ExtensionDtype``. Specifying a list or array using the traditional missing value
marker of ``np.nan`` will infer to integer dtype. The display of the ``Series`` will also use the ``NaN`` to indicate missing values in string outputs. (:issue:`20700`, :issue:`20747`, :issue:`22441`)
marker of ``np.nan`` will infer to integer dtype. The display of the ``Series`` will also use the ``NaN`` to indicate missing values in string outputs. (:issue:`20700`, :issue:`20747`, :issue:`22441`, :issue:`21789`, :issue:`22346`)

.. ipython:: python

Expand Down Expand Up @@ -91,6 +91,13 @@ These dtypes can be merged & reshaped & casted.
pd.concat([df[['A']], df[['B', 'C']]], axis=1).dtypes
df['A'].astype(float)

Reduction and groupby operations such as 'sum' work.

.. ipython:: python

df.sum()
df.groupby('B').A.sum()

.. warning::

The Integer NA support currently uses the capitalized dtype version, e.g. ``Int8`` as compared to the traditional ``int8``. This may be changed at a future date.
Expand Down Expand Up @@ -567,6 +574,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your
- Added :meth:`pandas.api.types.register_extension_dtype` to register an extension type with pandas (:issue:`22664`)
- Series backed by an ``ExtensionArray`` now work with :func:`util.hash_pandas_object` (:issue:`23066`)
- Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`)
- Support for reduction operations such as ``sum``, ``mean`` via opt-in base class method override (:issue:`22762`)

.. _whatsnew_0240.api.incompatibilities:

Expand Down
24 changes: 24 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,30 @@ def all_arithmetic_operators(request):
return request.param


# Reduction method names that the ``all_numeric_reductions`` fixture is
# parametrized over (one test run per name).
_all_numeric_reductions = [
    'sum', 'max', 'min',
    'mean', 'prod', 'std', 'var', 'median',
    'kurt', 'skew',
]


@pytest.fixture(params=_all_numeric_reductions)
def all_numeric_reductions(request):
    """
    Fixture for numeric reduction names.

    Returns the current parameter, i.e. one entry of
    ``_all_numeric_reductions``.
    """
    reduction_name = request.param
    return reduction_name


# Reduction method names that the ``all_boolean_reductions`` fixture is
# parametrized over (one test run per name).
_all_boolean_reductions = [
    'all',
    'any',
]


@pytest.fixture(params=_all_boolean_reductions)
def all_boolean_reductions(request):
    """
    Fixture for boolean reduction names.

    Returns the current parameter, i.e. one entry of
    ``_all_boolean_reductions``.
    """
    reduction_name = request.param
    return reduction_name


_cython_table = pd.core.base.SelectionMixin._cython_table.items()


Expand Down
31 changes: 31 additions & 0 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@ class ExtensionArray(object):
as they only compose abstract methods. Still, a more efficient
implementation may be available, and these methods can be overridden.
One can implement methods to handle array reductions.
* _reduce
This class does not inherit from 'abc.ABCMeta' for performance reasons.
Methods and properties required by the interface raise
``pandas.errors.AbstractMethodError`` and no ``register`` method is
Expand Down Expand Up @@ -675,6 +679,33 @@ def _ndarray_values(self):
"""
return np.array(self)

def _reduce(self, name, skipna=True, **kwargs):
    """
    Return a scalar result of performing the reduction operation.

    This default implementation supports no reduction at all and always
    raises; it is meant to be overridden.

    Parameters
    ----------
    name : str
        Name of the function, supported values are:
        { any, all, min, max, sum, mean, median, prod,
        std, var, sem, kurt, skew }.
    skipna : bool, default True
        If True, skip NaN values.
    **kwargs
        Additional keyword arguments passed to the reduction function.
        Currently, `ddof` is the only supported kwarg.

    Returns
    -------
    scalar

    Raises
    ------
    TypeError : subclass does not define reductions
    """
    # Build the message first so the raise itself stays on one line.
    message = "cannot perform {name} with type {dtype}".format(
        name=name, dtype=self.dtype)
    raise TypeError(message)


class ExtensionOpsMixin(object):
"""
Expand Down
6 changes: 2 additions & 4 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -2069,14 +2069,12 @@ def _reverse_indexer(self):
return result

# reduction ops #
def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
filter_type=None, **kwds):
""" perform the reduction type operation """
def _reduce(self, name, axis=0, skipna=True, **kwargs):
func = getattr(self, name, None)
if func is None:
msg = 'Categorical cannot perform the operation {op}'
raise TypeError(msg.format(op=name))
return func(numeric_only=numeric_only, **kwds)
return func(**kwargs)

def min(self, numeric_only=None, **kwargs):
""" The minimum value of the object.
Expand Down
26 changes: 26 additions & 0 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from pandas.compat import u, range, string_types
from pandas.compat import set_function_name

from pandas.core import nanops
from pandas.core.dtypes.cast import astype_nansafe
from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass
from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -529,6 +530,31 @@ def cmp_method(self, other):
name = '__{name}__'.format(name=op.__name__)
return set_function_name(cmp_method, name, cls)

def _reduce(self, name, skipna=True, **kwargs):
    """
    Return a scalar result of performing the reduction operation.

    Parameters
    ----------
    name : str
        Name of the reduction.  It is resolved to the ``nanops``
        function named ``'nan' + name``.
    skipna : bool, default True
        Forwarded to the ``nanops`` function.
    **kwargs
        Accepted for interface compatibility.

    Returns
    -------
    scalar
        For 'sum', 'min', 'max' and 'prod' a Python ``int`` when the
        result is non-missing and integral; otherwise whatever the
        ``nanops`` function returned.
    """
    # NOTE(review): **kwargs (e.g. ``ddof``) is not forwarded to the
    # nanops function below -- confirm this is intended.
    data = self._data
    mask = self._mask

    # coerce to a nan-aware float if needed
    if mask.any():
        data = self._data.astype('float64')
        data[mask] = self._na_value

    op = getattr(nanops, 'nan' + name)
    result = op(data, axis=0, skipna=skipna, mask=mask)

    # Boolean ops ('any', 'all') and the remaining ops are returned as
    # computed.  Only for a preservable numeric op do we try to coerce
    # the result back to an integer.
    if name in ['sum', 'min', 'max', 'prod'] and notna(result):
        try:
            int_result = int(result)
        except OverflowError:
            # A float result that overflowed to +/-inf passes ``notna``
            # but cannot be converted to int; keep the float result
            # instead of crashing.
            pass
        else:
            if int_result == result:
                result = int_result

    return result

def _maybe_mask_result(self, result, mask, other, op_name):
"""
Parameters
Expand Down
17 changes: 13 additions & 4 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3392,16 +3392,25 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
"""
delegate = self._values
if isinstance(delegate, np.ndarray):
# Validate that 'axis' is consistent with Series's single axis.
if axis is not None:
self._get_axis_number(axis)

if axis is not None:
self._get_axis_number(axis)

# dispatch to ExtensionArray interface
if isinstance(delegate, ExtensionArray):
return delegate._reduce(name, skipna=skipna, **kwds)

# dispatch to numpy arrays
elif isinstance(delegate, np.ndarray):
if numeric_only:
raise NotImplementedError('Series.{0} does not implement '
'numeric_only.'.format(name))
with np.errstate(all='ignore'):
return op(delegate, skipna=skipna, **kwds)

# TODO(EA) dispatch to Index
# remove once all internals extension types are
# moved to ExtensionArrays
return delegate._reduce(op=op, name=name, axis=axis, skipna=skipna,
numeric_only=numeric_only,
filter_type=filter_type, **kwds)
Expand Down
45 changes: 41 additions & 4 deletions pandas/tests/arrays/test_integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,13 @@ def _check_op(self, s, op_name, other, exc=None):
# compute expected
mask = s.isna()

# if s is a DataFrame, squeeze to a Series
# for comparison
if isinstance(s, pd.DataFrame):
result = result.squeeze()
s = s.squeeze()
mask = mask.squeeze()

# other array is an Integer
if isinstance(other, IntegerArray):
omask = getattr(other, 'mask', None)
Expand Down Expand Up @@ -215,7 +222,6 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
s = pd.Series(data)
self._check_op(s, op, 1, exc=TypeError)

@pytest.mark.xfail(run=False, reason="_reduce needs implementation")
def test_arith_frame_with_scalar(self, data, all_arithmetic_operators):
# frame & scalar
op = all_arithmetic_operators
Expand Down Expand Up @@ -587,22 +593,53 @@ def test_cross_type_arithmetic():
tm.assert_series_equal(result, expected)


def test_groupby_mean_included():
@pytest.mark.parametrize('op', ['sum', 'min', 'max', 'prod'])
def test_preserve_dtypes(op):
# TODO(#22346): preserve Int64 dtype
# for ops that enable (mean would actually work here
# but generally it is a float return value)
df = pd.DataFrame({
"A": ['a', 'b', 'b'],
"B": [1, None, 3],
"C": integer_array([1, None, 3], dtype='Int64'),
})

result = df.groupby("A").sum()
# TODO(#22346): preserve Int64 dtype
# op
result = getattr(df.C, op)()
assert isinstance(result, int)

# groupby
result = getattr(df.groupby("A"), op)()
expected = pd.DataFrame({
"B": np.array([1.0, 3.0]),
"C": np.array([1, 3], dtype="int64")
}, index=pd.Index(['a', 'b'], name='A'))
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize('op', ['mean'])
def test_reduce_to_float(op):
    # Some reductions always give back a float, even when the value
    # happens to be a whole number.
    frame = pd.DataFrame({
        "A": ['a', 'b', 'b'],
        "B": [1, None, 3],
        "C": integer_array([1, None, 3], dtype='Int64'),
    })

    # reduction applied directly to the integer-array column
    scalar = getattr(frame.C, op)()
    assert isinstance(scalar, float)

    # the same reduction applied per group
    grouped = getattr(frame.groupby("A"), op)()
    group_index = pd.Index(['a', 'b'], name='A')
    expected = pd.DataFrame({
        "B": np.array([1.0, 3.0]),
        "C": np.array([1, 3], dtype="float64")
    }, index=group_index)
    tm.assert_frame_equal(grouped, expected)


def test_astype_nansafe():
# https://github.com/pandas-dev/pandas/pull/22343
arr = integer_array([np.nan, 1, 2], dtype="Int8")
Expand Down
2 changes: 2 additions & 0 deletions pandas/tests/dtypes/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,8 @@ def test_is_datetime_or_timedelta_dtype():
assert not com.is_datetime_or_timedelta_dtype(str)
assert not com.is_datetime_or_timedelta_dtype(pd.Series([1, 2]))
assert not com.is_datetime_or_timedelta_dtype(np.array(['a', 'b']))
assert not com.is_datetime_or_timedelta_dtype(
DatetimeTZDtype("ns", "US/Eastern"))

assert com.is_datetime_or_timedelta_dtype(np.datetime64)
assert com.is_datetime_or_timedelta_dtype(np.timedelta64)
Expand Down
4 changes: 4 additions & 0 deletions pandas/tests/extension/arrow/test_bool.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ def test_from_dtype(self, data):
pytest.skip("GH-22666")


class TestReduce(base.BaseNoReduceTests):
    """Run the inherited ``BaseNoReduceTests`` tests unchanged."""


def test_is_bool_dtype(data):
assert pd.api.types.is_bool_dtype(data)
assert pd.core.common.is_bool_indexer(data)
Expand Down
1 change: 1 addition & 0 deletions pandas/tests/extension/base/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ class TestMyDtype(BaseDtypeTests):
from .interface import BaseInterfaceTests # noqa
from .methods import BaseMethodsTests # noqa
from .ops import BaseArithmeticOpsTests, BaseComparisonOpsTests, BaseOpsUtil # noqa
from .reduce import BaseNoReduceTests, BaseNumericReduceTests, BaseBooleanReduceTests # noqa
from .missing import BaseMissingTests # noqa
from .reshaping import BaseReshapingTests # noqa
from .setitem import BaseSetitemTests # noqa
8 changes: 4 additions & 4 deletions pandas/tests/extension/base/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping):
"B": data_for_grouping})
result = df.groupby("B", as_index=as_index).A.mean()
_, index = pd.factorize(data_for_grouping, sort=True)
# TODO(ExtensionIndex): remove astype
index = pd.Index(index.astype(object), name="B")

index = pd.Index(index, name="B")
expected = pd.Series([3, 1, 4], index=index, name="A")
if as_index:
self.assert_series_equal(result, expected)
Expand All @@ -39,8 +39,8 @@ def test_groupby_extension_no_sort(self, data_for_grouping):
"B": data_for_grouping})
result = df.groupby("B", sort=False).A.mean()
_, index = pd.factorize(data_for_grouping, sort=False)
# TODO(ExtensionIndex): remove astype
index = pd.Index(index.astype(object), name="B")

index = pd.Index(index, name="B")
expected = pd.Series([1, 3, 4], index=index, name="A")
self.assert_series_equal(result, expected)

Expand Down
58 changes: 58 additions & 0 deletions pandas/tests/extension/base/reduce.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import warnings
import pytest
import pandas.util.testing as tm
import pandas as pd
from .base import BaseExtensionTests


class BaseReduceTests(BaseExtensionTests):
    """
    Reduction specific tests. Generally these only
    make sense for numeric/boolean operations.
    """

    def check_reduce(self, s, op_name, skipna):
        """
        Compare the reduction ``op_name`` of ``s`` with the same
        reduction computed after casting ``s`` to float64.
        """
        actual = getattr(s, op_name)(skipna=skipna)
        as_float = s.astype('float64')
        wanted = getattr(as_float, op_name)(skipna=skipna)
        tm.assert_almost_equal(actual, wanted)


class BaseNoReduceTests(BaseReduceTests):
    """
    Tests for arrays that do not define any reductions: every
    numeric and boolean reduction on the Series must raise TypeError.
    """

    @pytest.mark.parametrize('skipna', [True, False])
    def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):
        series = pd.Series(data)

        with pytest.raises(TypeError):
            getattr(series, all_numeric_reductions)(skipna=skipna)

    @pytest.mark.parametrize('skipna', [True, False])
    def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna):
        series = pd.Series(data)

        with pytest.raises(TypeError):
            getattr(series, all_boolean_reductions)(skipna=skipna)


class BaseNumericReduceTests(BaseReduceTests):
    """
    Tests for arrays that support the numeric reductions; each one is
    verified through ``check_reduce``.
    """

    @pytest.mark.parametrize('skipna', [True, False])
    def test_reduce_series(self, data, all_numeric_reductions, skipna):
        series = pd.Series(data)

        # min/max with empty produce numpy warnings
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", RuntimeWarning)
            self.check_reduce(series, all_numeric_reductions, skipna)


class BaseBooleanReduceTests(BaseReduceTests):
    """
    Tests for arrays that support the boolean reductions; each one is
    verified through ``check_reduce``.
    """

    @pytest.mark.parametrize('skipna', [True, False])
    def test_reduce_series(self, data, all_boolean_reductions, skipna):
        self.check_reduce(pd.Series(data), all_boolean_reductions, skipna)
Loading

0 comments on commit b843388

Please sign in to comment.