REF: implement DataFrame reductions blockwise

pandas-dev · jreback · Jan 1, 2020 · Nov 25, 2019 · Nov 25, 2019 · Nov 25, 2019
commit d1d07ffe81b1a007f04f5b2fa2ce245b8d9f5994
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -7606,6 +7606,46 @@ def _reduce(
         def f(x):
             return op(x, axis=axis, skipna=skipna, **kwds)
 
+        def _get_data(axis_matters):
+            """Return the frame to reduce over, as selected by ``filter_type``."""
+            if filter_type is None or filter_type == "numeric":
+                data = self._get_numeric_data()
+            elif filter_type == "bool":
+                if axis_matters:  # restrict to bool data only when axis == 0
+                    # GH#25101, GH#24434
+                    data = self._get_bool_data() if axis == 0 else self
+                else:
+                    data = self._get_bool_data()
+            else:  # pragma: no cover
+                raise NotImplementedError(
+                    "Generating numeric_only data with filter_type {f} "
+                    "not supported.".format(f=filter_type)
+                )  # trailing space above keeps "{f}" and "not" from running together
+            return data
+
+        if self.size == 0:
+            pass
+
+        elif numeric_only is False:
+            res = self._data.reduce(op)
+            assert isinstance(res, dict)
+            assert len(res) == max(list(res.keys())) + 1, res.keys()
+            out = self._constructor_sliced(res, index=range(len(res)))
+            out.index = self.columns
+            return out
+
+        elif numeric_only is True and axis == 0:
+            data = _get_data(axis_matters=True)
+            return data._reduce(
+                op,
+                name,
+                axis=axis,
+                skipna=skipna,
+                numeric_only=False,
+                filter_type=filter_type,
+                **kwds,
+            )
+
         if numeric_only is None:
             values = self.values
             try:
@@ -7616,7 +7656,7 @@ def f(x):
                     # TODO: combine with hasattr(result, 'dtype') further down
                     # hard since we don't have `values` down there.
                     result = np.bool_(result)
-            except TypeError as err:
+            except TypeError:
                 # e.g. in nanops trying to convert strs to float
 
                 # try by-column first
@@ -7639,31 +7679,14 @@ def f(x):
                         result = result.iloc[0]
                     return result
 
-                if filter_type is None or filter_type == "numeric":
-                    data = self._get_numeric_data()
-                elif filter_type == "bool":
-                    data = self._get_bool_data()
-                else:  # pragma: no cover
-                    raise NotImplementedError(
-                        "Handling exception with filter_type {f} not"
-                        "implemented.".format(f=filter_type)
-                    ) from err
+                # TODO: why doesnt axis matter here?
+                data = _get_data(axis_matters=False)
                 with np.errstate(all="ignore"):
                     result = f(data.values)
                 labels = data._get_agg_axis(axis)
         else:
             if numeric_only:
-                if filter_type is None or filter_type == "numeric":
-                    data = self._get_numeric_data()
-                elif filter_type == "bool":
-                    # GH 25101, # GH 24434
-                    data = self._get_bool_data() if axis == 0 else self
-                else:  # pragma: no cover
-                    msg = (
-                        "Generating numeric_only data with filter_type {f}"
-                        "not supported.".format(f=filter_type)
-                    )
-                    raise NotImplementedError(msg)
+                data = _get_data(axis_matters=True)
                 values = data.values
                 labels = data._get_agg_axis(axis)
             else:

diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -343,6 +343,50 @@ def _verify_integrity(self):
                 "tot_items: {1}".format(len(self.items), tot_items)
             )
 
+    def reduce(self, func, *args, **kwargs):  # args/kwargs are forwarded to func
+        # Reduce each block with ``func``; if 2D, we assume it operates column-wise
+        if self.ndim == 1:
+            # 1D: only blocks[0] is used; func's result (expected scalar) is returned
+            blk = self.blocks[0]
+            return func(blk.values, *args, **kwargs)
+
+        res = {}  # mgr_locs entry -> reduced value, accumulated over all blocks
+        for blk in self.blocks:
+            bres = func(blk.values, *args, **kwargs)
+            if np.ndim(bres) == 0 and blk.shape[0] != 1:
+                # i.e. we reduced over all axes and not just one; re-do column-wise
+                new_res = {
+                    blk.mgr_locs.as_array[i]: func(blk.values[i], *args, **kwargs)
+                    for i in range(len(blk.values))
+                }
+            elif np.ndim(bres) == 0:
+                # EA (presumably ExtensionArray): scalar from a block with shape[0] == 1
+                assert blk.shape[0] == 1, (
+                    blk.shape,
+                    blk.values.dtype,
+                    bres,
+                    func,
+                    args,
+                    kwargs,
+                )
+                new_res = zip(blk.mgr_locs.as_array, [bres])
+            else:  # one result per entry along the block's first axis (asserted below)
+                assert bres.ndim == 1, bres.shape
+                assert blk.shape[0] == len(bres), (
+                    blk.shape,
+                    bres.shape,
+                    func,
+                    args,
+                    kwargs,
+                )
+                new_res = zip(blk.mgr_locs.as_array, bres)
+
+            nr = dict(new_res)
+            assert not any(key in res for key in nr)  # each location is set only once
+            res.update(nr)
+
+        return res
+
     def apply(
         self,
         f,

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -772,7 +772,7 @@ def test_omit_nuisance(df):
 
     # won't work with axis = 1
     grouped = df.groupby({"A": 0, "C": 0, "D": 1, "E": 1}, axis=1)
-    msg = r"unsupported operand type\(s\) for \+: 'Timestamp'"
+    msg = "reduction operation 'sum' not allowed for this dtype"
     with pytest.raises(TypeError, match=msg):
         grouped.agg(lambda x: x.sum(0, numeric_only=False))