Skip to content

BUG: fixes pd.Grouper for non-datetimelike groupings #8866 #8964

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.15.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ API changes

- Bug in ``concat`` of Series with ``category`` dtype, which was coercing the result to ``object``. (:issue:`8641`)

- Bug in ``pd.Grouper`` when specifying a non-datetimelike grouping. (:issue:`8866`)

- ``Series.all`` and ``Series.any`` now support the ``level`` and ``skipna`` parameters. ``Series.all``, ``Series.any``, ``Index.all``, and ``Index.any`` no longer support the ``out`` and ``keepdims`` parameters, which existed for compatibility with ndarray. Various index types no longer support the ``all`` and ``any`` aggregation functions and will now raise ``TypeError``. (:issue:`8302`):

.. ipython:: python
Expand Down
91 changes: 83 additions & 8 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def __new__(cls, *args, **kwargs):
cls = TimeGrouper
return super(Grouper, cls).__new__(cls)

def __init__(self, key=None, level=None, freq=None, axis=None, sort=False):
def __init__(self, key=None, level=None, freq=None, axis=0, sort=False):
self.key=key
self.level=level
self.freq=freq
Expand Down Expand Up @@ -232,22 +232,40 @@ def _get_grouper(self, obj):

def _set_grouper(self, obj, sort=False):
"""
given an object and the specifcations, setup the internal grouper for this particular specification
given an object and the specifications, setup the internal grouper
for this particular specification

Parameters
----------
obj : the subject object

"""
if self.freq is not None:
self._set_timegrouper(obj)
return self.grouper
else:
self._set_basegrouper(obj)

def _set_timegrouper(self, obj, sort=False):
"""
given an object and the specifications, setup the internal grouper
as a Datetime Index

Parameters
----------
obj : the subject object

"""
if self.key is not None and self.level is not None:
raise ValueError("The Grouper cannot specify both a key and a level!")
raise ValueError("The Grouper cannot specify both a key and \
a level!")

# the key must be a valid info item
if self.key is not None:
key = self.key
if key not in obj._info_axis:
raise KeyError("The grouper name {0} is not found".format(key))
raise KeyError("The grouper name {0} is not found"
.format(key))
ax = Index(obj[key], name=key)

else:
Expand All @@ -259,20 +277,78 @@ def _set_grouper(self, obj, sort=False):
# equivalent to the axis name
if isinstance(ax, MultiIndex):
level = ax._get_level_number(level)
ax = Index(ax.get_level_values(level), name=ax.names[level])
ax = Index(ax.get_level_values(level),
name=ax.names[level])

else:
if level not in (0, ax.name):
raise ValueError("The level {0} is not valid".format(level))
raise ValueError("The level {0} is not valid"
.format(level))

# possibly sort
if (self.sort or sort) and not ax.is_monotonic:
indexer = self.indexer = ax.argsort(kind='quicksort')
ax = ax.take(indexer)
obj = obj.take(indexer, axis=self.axis, convert=False, is_copy=False)
obj = obj.take(indexer, axis=self.axis, convert=False,
is_copy=False)

self.obj = obj
self.grouper = ax

def _set_basegrouper(self, obj, sort=False):
    """
    given an object and the specifications, setup the internal grouper
    as a BaseGrouper Class wrapping a single Grouping

    Parameters
    ----------
    obj : the subject object
    sort : boolean, default False
        passed to the Grouping as ``sort``; ``self.sort`` enables it too

    Returns
    -------
    the BaseGrouper, which is also stored as ``self.grouper``

    Raises
    ------
    KeyError
        if ``self.key`` is given but is not found in ``obj._info_axis``
    ValueError
        if ``self.level`` is given for a non-MultiIndex axis and is
        neither 0 nor the axis name, or if ``self.key`` is a Categorical
        whose grouper length differs from ``len(obj)``

    """
    ax = obj._get_axis(self.axis)
    gpr = self.key

    # validate the key, or (only when no key is given) the level
    if self.key is not None:
        if self.key not in obj._info_axis:
            raise KeyError("The grouper name {0} is not found"
                           .format(self.key))
    elif (self.level is not None and not isinstance(ax, MultiIndex)
            and self.level not in (0, ax.name)):
        raise ValueError("The level {0} is not valid"
                         .format(self.level))

    def is_in_axis(key):
        # label-like keys are accepted as-is; anything else must be
        # locatable among the items of obj
        if not _is_label_like(key):
            try:
                obj._data.items.get_loc(key)
            except Exception:
                return False
        return True

    # if the grouper is obj[name]
    def is_in_obj(gpr):
        # any failure (no ``name`` attribute, failed lookup, ...) means
        # the grouper is not the very object stored in obj
        try:
            return gpr is obj[gpr.name]
        except Exception:
            return False

    if is_in_obj(gpr):  # df.groupby(df['name'])
        in_axis, name = True, gpr.name
    elif is_in_axis(gpr):  # df.groupby('name')
        in_axis, name, gpr = True, gpr, obj[gpr]
    else:
        in_axis, name = False, None

    if isinstance(self.key, Categorical) and len(gpr) != len(obj):
        # adjacent literals instead of a backslash continuation, so the
        # source indentation can never leak into the message
        raise ValueError("Categorical grouper must have len(grouper) "
                         "== len(data)")

    # honour the sort flag of the Grouper itself as well as the argument,
    # as the datetime path does
    grouping = [Grouping(ax, gpr, obj=obj, name=name,
                         level=self.level, sort=(self.sort or sort),
                         in_axis=in_axis)]
    grouper = BaseGrouper(ax, grouping)
    self.obj = obj
    self.grouper = grouper
    return self.grouper

def _get_binner_for_grouping(self, obj):
Expand Down Expand Up @@ -2137,7 +2213,6 @@ def is_in_obj(gpr):

ping = Grouping(group_axis, gpr, obj=obj, name=name,
level=level, sort=sort, in_axis=in_axis)

groupings.append(ping)

if len(groupings) == 0:
Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -3677,6 +3677,16 @@ def test_timegrouper_get_group(self):
dt = pd.Timestamp(t)
result = grouped.get_group(dt)
assert_frame_equal(result, expected)

def test_grouper_with_nondatetime(self):
# GH 8866
s = Series(np.arange(8),index=pd.MultiIndex.from_product([list('ab'),
range(2),pd.date_range('20130101',periods=2)],
names=['one','two','three']))

expected = Series(data = [6,22], index=pd.Index(['a','b'], name='one'))
result = s.groupby(pd.Grouper(level='one')).sum()
assert_series_equal(result,expected)

def test_cumcount(self):
df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'])
Expand Down