From b843388dd4b8c5f6c9c9dc959c671629bcfe0f2f Mon Sep 17 00:00:00 2001
From: Jeff Reback <jeff@reback.net>
Date: Fri, 12 Oct 2018 08:19:57 -0400
Subject: [PATCH] ENH: add groupby & reduce support to EA (#22762)

---
 doc/source/whatsnew/v0.24.0.txt               | 10 +++-
 pandas/conftest.py                            | 24 ++++++++
 pandas/core/arrays/base.py                    | 31 ++++++++++
 pandas/core/arrays/categorical.py             |  6 +-
 pandas/core/arrays/integer.py                 | 26 +++++++++
 pandas/core/series.py                         | 17 ++++--
 pandas/tests/arrays/test_integer.py           | 45 ++++++++++++--
 pandas/tests/dtypes/test_common.py            |  2 +
 pandas/tests/extension/arrow/test_bool.py     |  4 ++
 pandas/tests/extension/base/__init__.py       |  1 +
 pandas/tests/extension/base/groupby.py        |  8 +--
 pandas/tests/extension/base/reduce.py         | 58 +++++++++++++++++++
 pandas/tests/extension/decimal/array.py       | 12 ++++
 .../tests/extension/decimal/test_decimal.py   | 22 +++++++
 pandas/tests/extension/json/test_json.py      |  4 ++
 pandas/tests/extension/test_categorical.py    |  4 ++
 pandas/tests/extension/test_integer.py        | 22 +++----
 pandas/tests/extension/test_interval.py       |  4 ++
 18 files changed, 269 insertions(+), 31 deletions(-)
 create mode 100644 pandas/tests/extension/base/reduce.py

diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index 420711e469c743..81c517c956e77a 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -48,7 +48,7 @@ Pandas has gained the ability to hold integer dtypes with missing values. This l
 Here is an example of the usage.
 
 We can construct a ``Series`` with the specified dtype. The dtype string ``Int64`` is a pandas ``ExtensionDtype``. Specifying a list or array using the traditional missing value
-marker of ``np.nan`` will infer to integer dtype. The display of the ``Series`` will also use the ``NaN`` to indicate missing values in string outputs. (:issue:`20700`, :issue:`20747`, :issue:`22441`)
+marker of ``np.nan`` will infer to integer dtype. The display of the ``Series`` will also use the ``NaN`` to indicate missing values in string outputs. (:issue:`20700`, :issue:`20747`, :issue:`22441`, :issue:`21789`, :issue:`22346`)
 
 .. ipython:: python
 
@@ -91,6 +91,13 @@ These dtypes can be merged & reshaped & casted.
    pd.concat([df[['A']], df[['B', 'C']]], axis=1).dtypes
    df['A'].astype(float)
 
+Reduction and groupby operations such as 'sum' work.
+
+.. ipython:: python
+
+   df.sum()
+   df.groupby('B').A.sum()
+
 .. warning::
 
    The Integer NA support currently uses the captilized dtype version, e.g. ``Int8`` as compared to the traditional ``int8``. This may be changed at a future date.
@@ -567,6 +574,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your
 - Added :meth:`pandas.api.types.register_extension_dtype` to register an extension type with pandas (:issue:`22664`)
 - Series backed by an ``ExtensionArray`` now work with :func:`util.hash_pandas_object` (:issue:`23066`)
 - Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`)
+- Support for reduction operations such as ``sum``, ``mean`` via opt-in base class method override (:issue:`22762`)
 
 .. _whatsnew_0240.api.incompatibilities:
 
diff --git a/pandas/conftest.py b/pandas/conftest.py
index 621de3ffd4b12c..e84657a79b51ae 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -131,6 +131,30 @@ def all_arithmetic_operators(request):
     return request.param
 
 
+_all_numeric_reductions = ['sum', 'max', 'min',
+                           'mean', 'prod', 'std', 'var', 'median',
+                           'kurt', 'skew']
+
+
+@pytest.fixture(params=_all_numeric_reductions)
+def all_numeric_reductions(request):
+    """
+    Fixture for numeric reduction names
+    """
+    return request.param
+
+
+_all_boolean_reductions = ['all', 'any']
+
+
+@pytest.fixture(params=_all_boolean_reductions)
+def all_boolean_reductions(request):
+    """
+    Fixture for boolean reduction names
+    """
+    return request.param
+
+
 _cython_table = pd.core.base.SelectionMixin._cython_table.items()
 
 
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 627afd1b6f8600..ef7e25033f24e3 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -63,6 +63,10 @@ class ExtensionArray(object):
     as they only compose abstract methods. Still, a more efficient
     implementation may be available, and these methods can be overridden.
 
+    One can implement methods to handle array reductions.
+
+    * _reduce
+
     This class does not inherit from 'abc.ABCMeta' for performance reasons.
     Methods and properties required by the interface raise
     ``pandas.errors.AbstractMethodError`` and no ``register`` method is
@@ -675,6 +679,33 @@ def _ndarray_values(self):
         """
         return np.array(self)
 
+    def _reduce(self, name, skipna=True, **kwargs):
+        """
+        Return a scalar result of performing the reduction operation.
+
+        Parameters
+        ----------
+        name : str
+            Name of the function, supported values are:
+            { any, all, min, max, sum, mean, median, prod,
+            std, var, sem, kurt, skew }.
+        skipna : bool, default True
+            If True, skip NaN values.
+        **kwargs
+            Additional keyword arguments passed to the reduction function.
+            Currently, `ddof` is the only supported kwarg.
+
+        Returns
+        -------
+        scalar
+
+        Raises
+        ------
+        TypeError : subclass does not define reductions
+        """
+        raise TypeError("cannot perform {name} with type {dtype}".format(
+            name=name, dtype=self.dtype))
+
 
 class ExtensionOpsMixin(object):
     """
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 216bccf7d63093..79070bbbfd11ab 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -2069,14 +2069,12 @@ def _reverse_indexer(self):
         return result
 
     # reduction ops #
-    def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
-                filter_type=None, **kwds):
-        """ perform the reduction type operation """
+    def _reduce(self, name, axis=0, skipna=True, **kwargs):
         func = getattr(self, name, None)
         if func is None:
             msg = 'Categorical cannot perform the operation {op}'
             raise TypeError(msg.format(op=name))
-        return func(numeric_only=numeric_only, **kwds)
+        return func(**kwargs)
 
     def min(self, numeric_only=None, **kwargs):
         """ The minimum value of the object.
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
index e58109a25e1a57..9917045f2f7d2a 100644
--- a/pandas/core/arrays/integer.py
+++ b/pandas/core/arrays/integer.py
@@ -8,6 +8,7 @@
 from pandas.compat import u, range, string_types
 from pandas.compat import set_function_name
 
+from pandas.core import nanops
 from pandas.core.dtypes.cast import astype_nansafe
 from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass
 from pandas.core.dtypes.common import (
@@ -529,6 +530,31 @@ def cmp_method(self, other):
         name = '__{name}__'.format(name=op.__name__)
         return set_function_name(cmp_method, name, cls)
 
+    def _reduce(self, name, skipna=True, **kwargs):
+        data = self._data
+        mask = self._mask
+
+        # coerce to a nan-aware float if needed
+        if mask.any():
+            data = self._data.astype('float64')
+            data[mask] = self._na_value
+
+        op = getattr(nanops, 'nan' + name)
+        result = op(data, axis=0, skipna=skipna, mask=mask)
+
+        # if we have a boolean op, don't coerce
+        if name in ['any', 'all']:
+            pass
+
+        # if we have a preservable numeric op,
+        # provide coercion back to an integer type if possible
+        elif name in ['sum', 'min', 'max', 'prod'] and notna(result):
+            int_result = int(result)
+            if int_result == result:
+                result = int_result
+
+        return result
+
     def _maybe_mask_result(self, result, mask, other, op_name):
         """
         Parameters
diff --git a/pandas/core/series.py b/pandas/core/series.py
index a613b22ea90467..bff0f9fe255324 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -3392,16 +3392,25 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
 
         """
         delegate = self._values
-        if isinstance(delegate, np.ndarray):
-            # Validate that 'axis' is consistent with Series's single axis.
-            if axis is not None:
-                self._get_axis_number(axis)
+
+        if axis is not None:
+            self._get_axis_number(axis)
+
+        # dispatch to ExtensionArray interface
+        if isinstance(delegate, ExtensionArray):
+            return delegate._reduce(name, skipna=skipna, **kwds)
+
+        # dispatch to numpy arrays
+        elif isinstance(delegate, np.ndarray):
             if numeric_only:
                 raise NotImplementedError('Series.{0} does not implement '
                                           'numeric_only.'.format(name))
             with np.errstate(all='ignore'):
                 return op(delegate, skipna=skipna, **kwds)
 
+        # TODO(EA) dispatch to Index
+        # remove once all internals extension types are
+        # moved to ExtensionArrays
         return delegate._reduce(op=op, name=name, axis=axis, skipna=skipna,
                                 numeric_only=numeric_only,
                                 filter_type=filter_type, **kwds)
diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py
index 349a6aee5701ea..23ee8d217bd590 100644
--- a/pandas/tests/arrays/test_integer.py
+++ b/pandas/tests/arrays/test_integer.py
@@ -114,6 +114,13 @@ def _check_op(self, s, op_name, other, exc=None):
         # compute expected
         mask = s.isna()
 
+        # if s is a DataFrame, squeeze to a Series
+        # for comparison
+        if isinstance(s, pd.DataFrame):
+            result = result.squeeze()
+            s = s.squeeze()
+            mask = mask.squeeze()
+
         # other array is an Integer
         if isinstance(other, IntegerArray):
             omask = getattr(other, 'mask', None)
@@ -215,7 +222,6 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
         s = pd.Series(data)
         self._check_op(s, op, 1, exc=TypeError)
 
-    @pytest.mark.xfail(run=False, reason="_reduce needs implementation")
     def test_arith_frame_with_scalar(self, data, all_arithmetic_operators):
         # frame & scalar
         op = all_arithmetic_operators
@@ -587,15 +593,23 @@ def test_cross_type_arithmetic():
     tm.assert_series_equal(result, expected)
 
 
-def test_groupby_mean_included():
+@pytest.mark.parametrize('op', ['sum', 'min', 'max', 'prod'])
+def test_preserve_dtypes(op):
+    # TODO(#22346): preserve Int64 dtype
+    # for ops that enable (mean would actually work here
+    # but generally it is a float return value)
     df = pd.DataFrame({
         "A": ['a', 'b', 'b'],
         "B": [1, None, 3],
         "C": integer_array([1, None, 3], dtype='Int64'),
     })
 
-    result = df.groupby("A").sum()
-    # TODO(#22346): preserve Int64 dtype
+    # op
+    result = getattr(df.C, op)()
+    assert isinstance(result, int)
+
+    # groupby
+    result = getattr(df.groupby("A"), op)()
     expected = pd.DataFrame({
         "B": np.array([1.0, 3.0]),
         "C": np.array([1, 3], dtype="int64")
@@ -603,6 +617,29 @@ def test_groupby_mean_included():
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.parametrize('op', ['mean'])
+def test_reduce_to_float(op):
+    # some reduce ops always return float, even if the result
+    # is a rounded number
+    df = pd.DataFrame({
+        "A": ['a', 'b', 'b'],
+        "B": [1, None, 3],
+        "C": integer_array([1, None, 3], dtype='Int64'),
+    })
+
+    # op
+    result = getattr(df.C, op)()
+    assert isinstance(result, float)
+
+    # groupby
+    result = getattr(df.groupby("A"), op)()
+    expected = pd.DataFrame({
+        "B": np.array([1.0, 3.0]),
+        "C": np.array([1, 3], dtype="float64")
+    }, index=pd.Index(['a', 'b'], name='A'))
+    tm.assert_frame_equal(result, expected)
+
+
 def test_astype_nansafe():
     # https://github.com/pandas-dev/pandas/pull/22343
     arr = integer_array([np.nan, 1, 2], dtype="Int8")
diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py
index f87c51a4ee16b9..882b2c156478a1 100644
--- a/pandas/tests/dtypes/test_common.py
+++ b/pandas/tests/dtypes/test_common.py
@@ -386,6 +386,8 @@ def test_is_datetime_or_timedelta_dtype():
     assert not com.is_datetime_or_timedelta_dtype(str)
     assert not com.is_datetime_or_timedelta_dtype(pd.Series([1, 2]))
     assert not com.is_datetime_or_timedelta_dtype(np.array(['a', 'b']))
+    assert not com.is_datetime_or_timedelta_dtype(
+        DatetimeTZDtype("ns", "US/Eastern"))
 
     assert com.is_datetime_or_timedelta_dtype(np.datetime64)
     assert com.is_datetime_or_timedelta_dtype(np.timedelta64)
diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py
index e1afedcade3ff7..12c37d1fdf895c 100644
--- a/pandas/tests/extension/arrow/test_bool.py
+++ b/pandas/tests/extension/arrow/test_bool.py
@@ -39,6 +39,10 @@ def test_from_dtype(self, data):
         pytest.skip("GH-22666")
 
 
+class TestReduce(base.BaseNoReduceTests):
+    pass
+
+
 def test_is_bool_dtype(data):
     assert pd.api.types.is_bool_dtype(data)
     assert pd.core.common.is_bool_indexer(data)
diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py
index b6b81bb941a59c..d11bb8b6beb779 100644
--- a/pandas/tests/extension/base/__init__.py
+++ b/pandas/tests/extension/base/__init__.py
@@ -48,6 +48,7 @@ class TestMyDtype(BaseDtypeTests):
 from .interface import BaseInterfaceTests  # noqa
 from .methods import BaseMethodsTests  # noqa
 from .ops import BaseArithmeticOpsTests, BaseComparisonOpsTests, BaseOpsUtil  # noqa
+from .reduce import BaseNoReduceTests, BaseNumericReduceTests, BaseBooleanReduceTests  # noqa
 from .missing import BaseMissingTests  # noqa
 from .reshaping import BaseReshapingTests  # noqa
 from .setitem import BaseSetitemTests  # noqa
diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py
index 174997c7d51e17..52c635d286df6e 100644
--- a/pandas/tests/extension/base/groupby.py
+++ b/pandas/tests/extension/base/groupby.py
@@ -25,8 +25,8 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping):
                            "B": data_for_grouping})
         result = df.groupby("B", as_index=as_index).A.mean()
         _, index = pd.factorize(data_for_grouping, sort=True)
-        # TODO(ExtensionIndex): remove astype
-        index = pd.Index(index.astype(object), name="B")
+
+        index = pd.Index(index, name="B")
         expected = pd.Series([3, 1, 4], index=index, name="A")
         if as_index:
             self.assert_series_equal(result, expected)
@@ -39,8 +39,8 @@ def test_groupby_extension_no_sort(self, data_for_grouping):
                            "B": data_for_grouping})
         result = df.groupby("B", sort=False).A.mean()
         _, index = pd.factorize(data_for_grouping, sort=False)
-        # TODO(ExtensionIndex): remove astype
-        index = pd.Index(index.astype(object), name="B")
+
+        index = pd.Index(index, name="B")
         expected = pd.Series([1, 3, 4], index=index, name="A")
         self.assert_series_equal(result, expected)
 
diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py
new file mode 100644
index 00000000000000..4f6c7988314c02
--- /dev/null
+++ b/pandas/tests/extension/base/reduce.py
@@ -0,0 +1,58 @@
+import warnings
+import pytest
+import pandas.util.testing as tm
+import pandas as pd
+from .base import BaseExtensionTests
+
+
+class BaseReduceTests(BaseExtensionTests):
+    """
+    Reduction specific tests. Generally these only
+    make sense for numeric/boolean operations.
+    """
+    def check_reduce(self, s, op_name, skipna):
+        result = getattr(s, op_name)(skipna=skipna)
+        expected = getattr(s.astype('float64'), op_name)(skipna=skipna)
+        tm.assert_almost_equal(result, expected)
+
+
+class BaseNoReduceTests(BaseReduceTests):
+    """ we don't define any reductions """
+
+    @pytest.mark.parametrize('skipna', [True, False])
+    def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):
+        op_name = all_numeric_reductions
+        s = pd.Series(data)
+
+        with pytest.raises(TypeError):
+            getattr(s, op_name)(skipna=skipna)
+
+    @pytest.mark.parametrize('skipna', [True, False])
+    def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna):
+        op_name = all_boolean_reductions
+        s = pd.Series(data)
+
+        with pytest.raises(TypeError):
+            getattr(s, op_name)(skipna=skipna)
+
+
+class BaseNumericReduceTests(BaseReduceTests):
+
+    @pytest.mark.parametrize('skipna', [True, False])
+    def test_reduce_series(self, data, all_numeric_reductions, skipna):
+        op_name = all_numeric_reductions
+        s = pd.Series(data)
+
+        # min/max with empty produce numpy warnings
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", RuntimeWarning)
+            self.check_reduce(s, op_name, skipna)
+
+
+class BaseBooleanReduceTests(BaseReduceTests):
+
+    @pytest.mark.parametrize('skipna', [True, False])
+    def test_reduce_series(self, data, all_boolean_reductions, skipna):
+        op_name = all_boolean_reductions
+        s = pd.Series(data)
+        self.check_reduce(s, op_name, skipna)
diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py
index a1ee3a4fefef24..53a598559393c3 100644
--- a/pandas/tests/extension/decimal/array.py
+++ b/pandas/tests/extension/decimal/array.py
@@ -134,6 +134,18 @@ def _na_value(self):
     def _concat_same_type(cls, to_concat):
         return cls(np.concatenate([x._data for x in to_concat]))
 
+    def _reduce(self, name, skipna=True, **kwargs):
+
+        if skipna:
+            raise NotImplementedError("decimal does not support skipna=True")
+
+        try:
+            op = getattr(self.data, name)
+        except AttributeError:
+            raise NotImplementedError("decimal does not support "
+                                      "the {} operation".format(name))
+        return op(axis=0)
+
 
 def to_decimal(values, context=None):
     return DecimalArray([decimal.Decimal(x) for x in values], context=context)
diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py
index 6724e183a06065..f84d24295b049b 100644
--- a/pandas/tests/extension/decimal/test_decimal.py
+++ b/pandas/tests/extension/decimal/test_decimal.py
@@ -131,6 +131,28 @@ class TestMissing(BaseDecimal, base.BaseMissingTests):
     pass
 
 
+class Reduce(object):
+
+    def check_reduce(self, s, op_name, skipna):
+
+        if skipna or op_name in ['median', 'skew', 'kurt']:
+            with pytest.raises(NotImplementedError):
+                getattr(s, op_name)(skipna=skipna)
+
+        else:
+            result = getattr(s, op_name)(skipna=skipna)
+            expected = getattr(np.asarray(s), op_name)()
+            tm.assert_almost_equal(result, expected)
+
+
+class TestNumericReduce(Reduce, base.BaseNumericReduceTests):
+    pass
+
+
+class TestBooleanReduce(Reduce, base.BaseBooleanReduceTests):
+    pass
+
+
 class TestMethods(BaseDecimal, base.BaseMethodsTests):
     @pytest.mark.parametrize('dropna', [True, False])
     @pytest.mark.xfail(reason="value_counts not implemented yet.")
diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py
index 6c8b12ed865fc1..15d99f6c5d2fc3 100644
--- a/pandas/tests/extension/json/test_json.py
+++ b/pandas/tests/extension/json/test_json.py
@@ -160,6 +160,10 @@ def test_fillna_frame(self):
                               reason="Dictionary order unstable")
 
 
+class TestReduce(base.BaseNoReduceTests):
+    pass
+
+
 class TestMethods(BaseJSON, base.BaseMethodsTests):
     @unhashable
     def test_value_counts(self, all_data, dropna):
diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py
index f118279c4b9157..a4518798aa4006 100644
--- a/pandas/tests/extension/test_categorical.py
+++ b/pandas/tests/extension/test_categorical.py
@@ -164,6 +164,10 @@ def test_fillna_limit_backfill(self):
         pass
 
 
+class TestReduce(base.BaseNoReduceTests):
+    pass
+
+
 class TestMethods(base.BaseMethodsTests):
     pass
 
diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py
index fa5c89d85e5481..89c36bbe7b3252 100644
--- a/pandas/tests/extension/test_integer.py
+++ b/pandas/tests/extension/test_integer.py
@@ -207,18 +207,12 @@ class TestCasting(base.BaseCastingTests):
 
 
 class TestGroupby(base.BaseGroupbyTests):
+    pass
+
+
+class TestNumericReduce(base.BaseNumericReduceTests):
+    pass
+
 
-    @pytest.mark.xfail(reason="groupby not working", strict=True)
-    def test_groupby_extension_no_sort(self, data_for_grouping):
-        super(TestGroupby, self).test_groupby_extension_no_sort(
-            data_for_grouping)
-
-    @pytest.mark.parametrize('as_index', [
-        pytest.param(True,
-                     marks=pytest.mark.xfail(reason="groupby not working",
-                                             strict=True)),
-        False
-    ])
-    def test_groupby_extension_agg(self, as_index, data_for_grouping):
-        super(TestGroupby, self).test_groupby_extension_agg(
-            as_index, data_for_grouping)
+class TestBooleanReduce(base.BaseBooleanReduceTests):
+    pass
diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py
index 7302c5757d1447..183ebea927b105 100644
--- a/pandas/tests/extension/test_interval.py
+++ b/pandas/tests/extension/test_interval.py
@@ -98,6 +98,10 @@ class TestInterface(BaseInterval, base.BaseInterfaceTests):
     pass
 
 
+class TestReduce(base.BaseNoReduceTests):
+    pass
+
+
 class TestMethods(BaseInterval, base.BaseMethodsTests):
 
     @pytest.mark.skip(reason='addition is not defined for intervals')