Wired any/all into _get_cythonized_result

pandas-dev · jreback · Mar 1, 2018 · Feb 15, 2018 · Feb 15, 2018 · Feb 16, 2018
commit ae9126fa03b61ab526433036a20048fa1294c38c
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -314,18 +314,20 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def group_any(ndarray[int64_t] out,
-              ndarray values,
+def group_any(ndarray[uint8_t] out,
               ndarray[int64_t] labels,
+              ndarray[uint8_t] values,
+              ndarray[uint8_t] mask,
               bint skipna):
     """Aggregated boolean values to show if any group element is truthful
 
     Parameters
     ----------
-    out : array of int64_t values which this method will write its results to
-    values : array of values to be truth-tested
+    out : array of values which this method will write its results to
     labels : array containing unique label for each group, with its ordering
         matching up to the corresponding record in `values`
+    values : array containing the truth value of each element
+    mask : array indicating whether a value is na or not
     skipna : boolean
         Flag to ignore nan values during truth testing
 
@@ -337,40 +339,33 @@ def group_any(ndarray[int64_t] out,
     cdef:
         Py_ssize_t i, N=len(labels)
         int64_t lab
-        ndarray[int64_t] bool_mask
-        ndarray[uint8_t] isna_mask
-
-    if values.dtype == 'object':
-        bool_mask = np.array([bool(x) for x in values]).astype(np.int64)
-        isna_mask = missing.isnaobj(values).astype(np.uint8)
-    else:
-        bool_mask = values.astype(np.bool).astype(np.int64)
-        isna_mask = np.isnan(values).astype(np.uint8)
 
     with nogil:
         for i in range(N):
             lab = labels[i]
-            if lab < 0 or (skipna and isna_mask[i]):
+            if lab < 0 or (skipna and mask[i]):
                 continue
 
-            if bool_mask[i]:
+            if values[i]:
                 out[lab] = 1
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def group_all(ndarray[int64_t] out,
-              ndarray values,
+def group_all(ndarray[uint8_t] out,
               ndarray[int64_t] labels,
+              ndarray[uint8_t] values,
+              ndarray[uint8_t] mask,
               bint skipna):
     """Aggregated boolean values to show if all group elements are truthful
 
     Parameters
     ----------
-    out : array of int64_t values which this method will write its results to
-    values : array of values to be truth-tested
+    out : array of values which this method will write its results to
     labels : array containing unique label for each group, with its ordering
         matching up to the corresponding record in `values`
+    values : array containing the truth value of each element
+    mask : array indicating whether a value is na or not
     skipna : boolean
         Flag to ignore nan values during truth testing
 

diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -1219,6 +1219,29 @@ class GroupBy(_GroupBy):
     """
     _apply_whitelist = _common_apply_whitelist
 
+    def _bool_agg(self, how, skipna):
+        """Shared func to call any / all Cython GroupBy implementations"""
+
+        def objs_to_bool(vals):
+            try:
+                vals = vals.astype(np.bool)
+            except ValueError:  # for objects
+                vals = np.array([bool(x) for x in vals])
+
+            return vals.view(np.uint8)
+
+        def result_to_bool(result):
+            return result.astype(np.bool, copy=False)
+
+        return self._get_cythonized_result(how, self.grouper,
+                                           aggregate=True,
+                                           cython_dtype=np.uint8,
+                                           needs_values=True,
+                                           needs_mask=True,
+                                           pre_processing=objs_to_bool,
+                                           post_processing=result_to_bool,
+                                           skipna=skipna)
+
     @Substitution(name='groupby')
     @Appender(_doc_template)
     def any(self, skipna=True):
@@ -1229,15 +1252,19 @@ def any(self, skipna=True):
         skipna : bool, default True
             Flag to ignore nan values during truth testing
         """
-        labels, _, _ = self.grouper.group_info
-        output = collections.OrderedDict()
+        return self._bool_agg('group_any', skipna)
 
-        for name, obj in self._iterate_slices():
-            result = np.zeros(self.ngroups, dtype=np.int64)
-            libgroupby.group_any(result, obj.values, labels, skipna)
-            output[name] = result.astype(np.bool)
+    @Substitution(name='groupby')
+    @Appender(_doc_template)
+    def all(self, skipna=True):
+        """Returns True if all values in the group are truthful, else False
 
-        return self._wrap_aggregated_output(output)
+        Parameters
+        ----------
+        skipna : bool, default True
+            Flag to ignore nan values during truth testing
+        """
+        return self._bool_agg('group_all', skipna)
 
     @Substitution(name='groupby')
     @Appender(_doc_template)
@@ -1505,6 +1532,8 @@ def _fill(self, direction, limit=None):
 
         return self._get_cythonized_result('group_fillna_indexer',
                                            self.grouper, needs_mask=True,
+                                           cython_dtype=np.int64,
+                                           result_is_index=True,
                                            direction=direction, limit=limit)
 
     @Substitution(name='groupby')
@@ -1893,33 +1922,81 @@ def cummax(self, axis=0, **kwargs):
 
         return self._cython_transform('cummax', numeric_only=False)
 
-    def _get_cythonized_result(self, how, grouper, needs_mask=False,
-                               needs_ngroups=False, **kwargs):
+    def _get_cythonized_result(self, how, grouper, aggregate=False,
+                               cython_dtype=None, needs_values=False,
+                               needs_mask=False, needs_ngroups=False,
+                               result_is_index=False,
+                               pre_processing=None, post_processing=None,
+                               **kwargs):
         """Get result for Cythonized functions
 
         Parameters
         ----------
         how : str, Cythonized function name to be called
         grouper : Grouper object containing pertinent group info
+        aggregate : bool, default False
+            Whether the result should be aggregated to match the number of
+            groups
+        cython_dtype : default None
+            Type of the array that will be modified by the Cython call. If
+            `None`, the type will be inferred from the values of each slice
+        needs_values : bool, default False
+            Whether the values should be a part of the Cython call
+            signature
         needs_mask : bool, default False
-            Whether boolean mask needs to be part of the Cython call signature
+            Whether boolean mask needs to be part of the Cython call
+            signature
         needs_ngroups : bool, default False
-            Whether number of groups part of the Cython call signature
+            Whether number of groups is part of the Cython call signature
+        result_is_index : bool, default False
+            Whether the result of the Cython operation is an index of
+            values to be retrieved, instead of the actual values themselves
+        pre_processing : function, default None
+            Function to be applied to `values` prior to passing to Cython.
+            Raises ValueError if `needs_values` is False.
+        post_processing : function, default None
+            Function to be applied to result of Cython function
         **kwargs : dict
             Extra arguments to be passed back to Cython funcs
 
         Returns
         -------
         `Series` or `DataFrame`  with filled values
         """
+        if result_is_index and aggregate:
+            raise ValueError("'result_is_index' and 'aggregate' cannot both "
+                             "be True!")
+        if post_processing:
+            if not callable(post_processing):
+                raise ValueError("'post_processing' must be a callable!")
+        if pre_processing:
+            if not callable(pre_processing):
+                raise ValueError("'pre_processing' must be a callable!")
+            if not needs_values:
+                raise ValueError("Cannot use 'pre_processing' without "
+                                 "specifying 'needs_values'!")
 
         labels, _, ngroups = grouper.group_info
         output = collections.OrderedDict()
         base_func = getattr(libgroupby, how)
 
         for name, obj in self._iterate_slices():
-            indexer = np.zeros_like(labels, dtype=np.int64)
-            func = partial(base_func, indexer, labels)
+            if aggregate:
+                result_sz = ngroups
+            else:
+                result_sz = len(obj.values)
+
+            if not cython_dtype:
+                cython_dtype = obj.values.dtype
+
+            result = np.zeros(result_sz, dtype=cython_dtype)
+            func = partial(base_func, result, labels)
+            if needs_values:
+                vals = obj.values
+                if pre_processing:
+                    vals = pre_processing(vals)
+                func = partial(func, vals)
+
             if needs_mask:
                 mask = isnull(obj.values).view(np.uint8)
                 func = partial(func, mask)
@@ -1928,9 +2005,19 @@ def _get_cythonized_result(self, how, grouper, needs_mask=False,
                 func = partial(func, ngroups)
 
             func(**kwargs)  # Call func to modify indexer values in place
-            output[name] = algorithms.take_nd(obj.values, indexer)
 
-        return self._wrap_transformed_output(output)
+            if result_is_index:
+                result = algorithms.take_nd(obj.values, result)
+
+            if post_processing:
+                result = post_processing(result)
+
+            output[name] = result
+
+        if aggregate:
+            return self._wrap_aggregated_output(output)
+        else:
+            return self._wrap_transformed_output(output)
 
     @Substitution(name='groupby')
     @Appender(_doc_template)
@@ -1950,7 +2037,9 @@ def shift(self, periods=1, freq=None, axis=0):
             return self.apply(lambda x: x.shift(periods, freq, axis))
 
         return self._get_cythonized_result('group_shift_indexer',
-                                           self.grouper, needs_ngroups=True,
+                                           self.grouper, cython_dtype=np.int64,
+                                           needs_ngroups=True,
+                                           result_is_index=True,
                                            periods=periods)
 
     @Substitution(name='groupby')