From c94a68c60d0f2259cd0fa55182a723b51ea5edee Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 5 May 2018 08:44:13 -0400 Subject: [PATCH] DOC: followup to #20583, observed kwarg for .groupby (#20941) --- doc/source/groupby.rst | 2 +- doc/source/whatsnew/v0.23.0.txt | 8 +++++--- pandas/core/generic.py | 11 +++++------ pandas/core/groupby/groupby.py | 32 ++++++++++++++------------------ 4 files changed, 25 insertions(+), 28 deletions(-) diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 3616a7e1b41d2..da13a34cccfea 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -994,7 +994,7 @@ is only interesting over one column (here ``colname``), it may be filtered Handling of (un)observed Categorical values ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -When using a ``Categorical`` grouper (as a single or as part of multipler groupers), the ``observed`` keyword +When using a ``Categorical`` grouper (as a single grouper, or as part of multiple groupers), the ``observed`` keyword controls whether to return a cartesian product of all possible groupers values (``observed=False``) or only those that are observed groupers (``observed=True``). diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 979fbb5ddfdd0..d7ede0f9cefca 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -419,9 +419,11 @@ documentation. If you build an extension array, publicize it on our Categorical Groupers has gained an observed keyword ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In previous versions, grouping by 1 or more categorical columns would result in an index that was the cartesian product of all of the categories for -each grouper, not just the observed values.``.groupby()`` has gained the ``observed`` keyword to toggle this behavior. The default remains backward -compatible (generate a cartesian product). 
(:issue:`14942`, :issue:`8138`, :issue:`15217`, :issue:`17594`, :issue:`8669`, :issue:`20583`) +Grouping by a categorical includes the unobserved categories in the output. +When grouping with multiple groupers, this means you get the cartesian product of all the +categories, including combinations where there are no observations, which can result in a large +number of groups. We have added a keyword ``observed`` to control this behavior; it defaults to +``observed=False`` for backward-compatibility. (:issue:`14942`, :issue:`8138`, :issue:`15217`, :issue:`17594`, :issue:`8669`, :issue:`20583`, :issue:`20902`) .. ipython:: python diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e96a2a9f08520..343f36eabc0d7 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6584,7 +6584,7 @@ def clip_lower(self, threshold, axis=None, inplace=False): axis=axis, inplace=inplace) def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, - group_keys=True, squeeze=False, observed=None, **kwargs): + group_keys=True, squeeze=False, observed=False, **kwargs): """ Group series using mapper (dict or key function, apply given function to group, return result as series) or by a series of columns. @@ -6617,11 +6617,10 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, squeeze : boolean, default False reduce the dimensionality of the return type if possible, otherwise return a consistent type - observed : boolean, default None - if True: only show observed values for categorical groupers. - if False: show all values for categorical groupers. - if None: if any categorical groupers, show a FutureWarning, - default to False. + observed : boolean, default False + This only applies if any of the groupers are Categoricals. + If True: only show observed values for categorical groupers. + If False: show all values for categorical groupers. .. 
versionadded:: 0.23.0 diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f78f7cb625218..164d1bebd2929 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -556,7 +556,7 @@ class _GroupBy(PandasObject, SelectionMixin): def __init__(self, obj, keys=None, axis=0, level=None, grouper=None, exclusions=None, selection=None, as_index=True, sort=True, group_keys=True, squeeze=False, - observed=None, **kwargs): + observed=False, **kwargs): self._selection = selection @@ -2907,7 +2907,7 @@ class Grouping(object): """ def __init__(self, index, grouper=None, obj=None, name=None, level=None, - sort=True, observed=None, in_axis=False): + sort=True, observed=False, in_axis=False): self.name = name self.level = level @@ -2964,12 +2964,6 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, # a passed Categorical elif is_categorical_dtype(self.grouper): - # observed can be True/False/None - # we treat None as False. If in the future - # we need to warn if observed is not passed - # then we have this option - # gh-20583 - self.all_grouper = self.grouper self.grouper = self.grouper._codes_for_groupby( self.sort, observed) @@ -3088,7 +3082,7 @@ def groups(self): def _get_grouper(obj, key=None, axis=0, level=None, sort=True, - observed=None, mutated=False, validate=True): + observed=False, mutated=False, validate=True): """ create and return a BaseGrouper, which is an internal mapping of how to create the grouper indexers. @@ -4734,26 +4728,28 @@ def _wrap_agged_blocks(self, items, blocks): def _reindex_output(self, result): """ - if we have categorical groupers, then we want to make sure that + If we have categorical groupers, then we want to make sure that we have a fully reindex-output to the levels. These may have not participated in the groupings (e.g. 
may have all been - nan groups) + nan groups); this can re-expand the output space """ - # TODO(jreback): remove completely - # when observed parameter is defaulted to True - # gh-20583 - - if self.observed: - return result - + # we need to re-expand the output space to accommodate all values + # whether observed or not in the cartesian product of our groupers groupings = self.grouper.groupings if groupings is None: return result elif len(groupings) == 1: return result + + # if we only care about the observed values + # we are done + elif self.observed: + return result + + # reindexing only applies to a Categorical grouper elif not any(isinstance(ping.grouper, (Categorical, CategoricalIndex)) for ping in groupings): return result