Skip to content

Cythonized GroupBy any #19722

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Mar 1, 2018
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Wired any/all into _get_cythonized_result
  • Loading branch information
WillAyd committed Feb 28, 2018
commit ae9126fa03b61ab526433036a20048fa1294c38c
33 changes: 14 additions & 19 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -314,18 +314,20 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,

@cython.boundscheck(False)
@cython.wraparound(False)
def group_any(ndarray[int64_t] out,
ndarray values,
def group_any(ndarray[uint8_t] out,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we wanted to we could probably combine group_any and group_all into one function and just use an argument to tweak the execution.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah I think making group_any_all is better

ndarray[int64_t] labels,
ndarray[uint8_t] values,
ndarray[uint8_t] mask,
bint skipna):
"""Aggregated boolean values to show if any group element is truthful

Parameters
----------
out : array of int64_t values which this method will write its results to
values : array of values to be truth-tested
out : array of values which this method will write its results to
labels : array containing unique label for each group, with its ordering
matching up to the corresponding record in `values`
values : array containing the truth value of each element
mask : array indicating whether a value is na or not
skipna : boolean
Flag to ignore nan values during truth testing

Expand All @@ -337,40 +339,33 @@ def group_any(ndarray[int64_t] out,
cdef:
Py_ssize_t i, N=len(labels)
int64_t lab
ndarray[int64_t] bool_mask
ndarray[uint8_t] isna_mask

if values.dtype == 'object':
bool_mask = np.array([bool(x) for x in values]).astype(np.int64)
isna_mask = missing.isnaobj(values).astype(np.uint8)
else:
bool_mask = values.astype(np.bool).astype(np.int64)
isna_mask = np.isnan(values).astype(np.uint8)

with nogil:
for i in range(N):
lab = labels[i]
if lab < 0 or (skipna and isna_mask[i]):
if lab < 0 or (skipna and mask[i]):
continue

if bool_mask[i]:
if values[i]:
out[lab] = 1


@cython.boundscheck(False)
@cython.wraparound(False)
def group_all(ndarray[int64_t] out,
ndarray values,
def group_all(ndarray[uint8_t] out,
ndarray[int64_t] labels,
ndarray[uint8_t] values,
ndarray[uint8_t] mask,
bint skipna):
"""Aggregated boolean values to show if all group elements are truthful

Parameters
----------
out : array of int64_t values which this method will write its results to
values : array of values to be truth-tested
out : array of values which this method will write its results to
labels : array containing unique label for each group, with its ordering
matching up to the corresponding record in `values`
values : array containing the truth value of each element
mask : array indicating whether a value is na or not
skipna : boolean
Flag to ignore nan values during truth testing

Expand Down
121 changes: 105 additions & 16 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1219,6 +1219,29 @@ class GroupBy(_GroupBy):
"""
_apply_whitelist = _common_apply_whitelist

def _bool_agg(self, how, skipna):
"""Shared func to call any / all Cython GroupBy implementations"""

def objs_to_bool(vals):
try:
vals = vals.astype(np.bool)
except ValueError: # for objects
vals = np.array([bool(x) for x in vals])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we have an asv that hits this (the object path)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do not but happy to add. As far as objects are concerned, I see we have existing tests for first, last, nth, and count. I believe the gap there to be any, all, bfill, cumcount, ffill, head, nunique, shift, size, tail, unique and value_counts.

I'd suggest we group all of these into a GroupByMethodsObjects class (similar to the one for int / float). If you agree, would you want to move first, last, nth and count into that class or keep where they currently are? Moving them would keep the code-base more concise, but I'm not clear on how that would impact the history of benchmarks that you keep

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok to move
more important that the code is organized


return vals.view(np.uint8)

def result_to_bool(result):
return result.astype(np.bool, copy=False)

return self._get_cythonized_result(how, self.grouper,
aggregate=True,
cython_dtype=np.uint8,
needs_values=True,
needs_mask=True,
pre_processing=objs_to_bool,
post_processing=result_to_bool,
skipna=skipna)

@Substitution(name='groupby')
@Appender(_doc_template)
def any(self, skipna=True):
Expand All @@ -1229,15 +1252,19 @@ def any(self, skipna=True):
skipna : bool, default True
Flag to ignore nan values during truth testing
"""
labels, _, _ = self.grouper.group_info
output = collections.OrderedDict()
return self._bool_agg('group_any', skipna)

for name, obj in self._iterate_slices():
result = np.zeros(self.ngroups, dtype=np.int64)
libgroupby.group_any(result, obj.values, labels, skipna)
output[name] = result.astype(np.bool)
@Substitution(name='groupby')
@Appender(_doc_template)
def all(self, skipna=True):
"""Returns True if all values in the group are truthful, else False

return self._wrap_aggregated_output(output)
Parameters
----------
skipna : bool, default True
Flag to ignore nan values during truth testing
"""
return self._bool_agg('group_all', skipna)

@Substitution(name='groupby')
@Appender(_doc_template)
Expand Down Expand Up @@ -1505,6 +1532,8 @@ def _fill(self, direction, limit=None):

return self._get_cythonized_result('group_fillna_indexer',
self.grouper, needs_mask=True,
cython_dtype=np.int64,
result_is_index=True,
direction=direction, limit=limit)

@Substitution(name='groupby')
Expand Down Expand Up @@ -1893,33 +1922,81 @@ def cummax(self, axis=0, **kwargs):

return self._cython_transform('cummax', numeric_only=False)

def _get_cythonized_result(self, how, grouper, needs_mask=False,
needs_ngroups=False, **kwargs):
def _get_cythonized_result(self, how, grouper, aggregate=False,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Had to add some new keywords to make this more dynamic and support aggregation in addition to the transformations it was already doing with shift and fillna. I think most could be generally applicable save result_is_index which may be specific to the existing shift and fillna ops, but I didn't think there was a clear way to pass in an anonymous func to post_processing that would have access to the sliced value that that function needs.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok for now. these should be subclasses of common routines (in a separate module), then this will be much simpler / cleaner (but that's for later)

cython_dtype=None, needs_values=False,
needs_mask=False, needs_ngroups=False,
result_is_index=False,
pre_processing=None, post_processing=None,
**kwargs):
"""Get result for Cythonized functions

Parameters
----------
how : str, Cythonized function name to be called
grouper : Grouper object containing pertinent group info
aggregate : bool, default False
Whether the result should be aggregated to match the number of
groups
cython_dtype : default None
Type of the array that will be modified by the Cython call. If
`None`, the type will be inferred from the values of each slice
needs_values : bool, default False
Whether the values should be a part of the Cython call
signature
needs_mask : bool, default False
Whether boolean mask needs to be part of the Cython call signature
Whether boolean mask needs to be part of the Cython call
signature
needs_ngroups : bool, default False
Whether number of groups part of the Cython call signature
Whether number of groups is part of the Cython call signature
result_is_index : bool, default False
Whether the result of the Cython operation is an index of
values to be retrieved, instead of the actual values themselves
pre_processing : function, default None
Function to be applied to `values` prior to passing to Cython
Raises if `needs_values` is False
post_processing : function, default None
Function to be applied to result of Cython function
**kwargs : dict
Extra arguments to be passed back to Cython funcs

Returns
-------
`Series` or `DataFrame` with filled values
"""
if result_is_index and aggregate:
raise ValueError("'result_is_index' and 'aggregate' cannot both "
"be True!")
if post_processing:
if not callable(pre_processing):
raise ValueError("'post_processing' must be a callable!")
if pre_processing:
if not callable(pre_processing):
raise ValueError("'pre_processing' must be a callable!")
if not needs_values:
raise ValueError("Cannot use 'pre_processing' without "
"specifying 'needs_values'!")

labels, _, ngroups = grouper.group_info
output = collections.OrderedDict()
base_func = getattr(libgroupby, how)

for name, obj in self._iterate_slices():
indexer = np.zeros_like(labels, dtype=np.int64)
func = partial(base_func, indexer, labels)
if aggregate:
result_sz = ngroups
else:
result_sz = len(obj.values)

if not cython_dtype:
cython_dtype = obj.values.dtype

result = np.zeros(result_sz, dtype=cython_dtype)
func = partial(base_func, result, labels)
if needs_values:
vals = obj.values
if pre_processing:
vals = pre_processing(vals)
func = partial(func, vals)

if needs_mask:
mask = isnull(obj.values).view(np.uint8)
func = partial(func, mask)
Expand All @@ -1928,9 +2005,19 @@ def _get_cythonized_result(self, how, grouper, needs_mask=False,
func = partial(func, ngroups)

func(**kwargs) # Call func to modify indexer values in place
output[name] = algorithms.take_nd(obj.values, indexer)

return self._wrap_transformed_output(output)
if result_is_index:
result = algorithms.take_nd(obj.values, result)

if post_processing:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is all ok, assuming we are going to move this out of here to a separate module/class soonish.

result = post_processing(result)

output[name] = result

if aggregate:
return self._wrap_aggregated_output(output)
else:
return self._wrap_transformed_output(output)

@Substitution(name='groupby')
@Appender(_doc_template)
Expand All @@ -1950,7 +2037,9 @@ def shift(self, periods=1, freq=None, axis=0):
return self.apply(lambda x: x.shift(periods, freq, axis))

return self._get_cythonized_result('group_shift_indexer',
self.grouper, needs_ngroups=True,
self.grouper, cython_dtype=np.int64,
needs_ngroups=True,
result_is_index=True,
periods=periods)

@Substitution(name='groupby')
Expand Down