Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: GroupBy.get_group raises ValueError when group key contains NaT #6996

Merged
merged 1 commit into from
May 30, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions doc/source/groupby.rst
Original file line number Diff line number Diff line change
Expand Up @@ -784,11 +784,11 @@ will be (silently) dropped. Thus, this does not pose any problems:

df.groupby('A').std()

NA group handling
~~~~~~~~~~~~~~~~~
NA and NaT group handling
~~~~~~~~~~~~~~~~~~~~~~~~~

If there are any NaN values in the grouping key, these will be automatically
excluded. So there will never be an "NA group". This was not the case in older
If there are any NaN or NaT values in the grouping key, these will be automatically
excluded. So there will never be an "NA group" or "NaT group". This was not the case in older
versions of pandas, but users were generally discarding the NA group anyway
(and supporting it was an implementation headache).

Expand Down
6 changes: 6 additions & 0 deletions doc/source/whatsnew/v0.17.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,13 +69,19 @@ Bug Fixes
- Bug in ``Timestamp``'s ``microsecond``, ``quarter``, ``dayofyear``, ``week`` and ``daysinmonth`` properties returning ``np.int`` type, not built-in ``int``. (:issue:`10050`)
- Bug in ``NaT`` raising ``AttributeError`` when accessing the ``daysinmonth`` and ``dayofweek`` properties. (:issue:`10096`)


- Bug in getting timezone data with ``dateutil`` on various platforms (:issue:`9059`, :issue:`8639`, :issue:`9663`, :issue:`10121`)
- Bug in displaying datetimes with mixed frequencies uniformly; 'ms' datetimes are now displayed to the proper precision. (:issue:`10170`)



- Bug in ``DatetimeIndex`` and ``TimedeltaIndex`` where names are lost after timedelta arithmetic (:issue:`9926`)


- Bug in ``Series.plot(label="LABEL")`` not correctly setting the label (:issue:`10119`)
- Bug in ``plot`` not defaulting to matplotlib ``axes.grid`` setting (:issue:`9792`)


- Bug in ``GroupBy.get_group`` raising ``ValueError`` when the group key contains ``NaT`` (:issue:`6992`)


6 changes: 5 additions & 1 deletion pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,7 +426,11 @@ def convert(key, s):
return Timestamp(key).asm8
return key

sample = next(iter(self.indices))
if len(self.indices) > 0:
sample = next(iter(self.indices))
else:
sample = None # Dummy sample

if isinstance(sample, tuple):
if not isinstance(name, tuple):
msg = ("must supply a tuple to get_group with multiple"
Expand Down
4 changes: 3 additions & 1 deletion pandas/src/generate_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@

cimport util
from util cimport is_array, _checknull, _checknan, get_nat
cimport lib
from lib cimport is_null_datetimelike

cdef int64_t iNaT = get_nat()

Expand Down Expand Up @@ -673,7 +675,7 @@ def groupby_%(name)s(ndarray[%(c_type)s] index, ndarray labels):
for i in range(length):
key = util.get_value_1d(labels, i)

if _checknull(key):
if is_null_datetimelike(key):
continue

idx = index[i]
Expand Down
14 changes: 8 additions & 6 deletions pandas/src/generated.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ ctypedef unsigned char UChar

cimport util
from util cimport is_array, _checknull, _checknan, get_nat
cimport lib
from lib cimport is_null_datetimelike

cdef int64_t iNaT = get_nat()

Expand Down Expand Up @@ -2096,7 +2098,7 @@ def groupby_float64(ndarray[float64_t] index, ndarray labels):
for i in range(length):
key = util.get_value_1d(labels, i)

if _checknull(key):
if is_null_datetimelike(key):
continue

idx = index[i]
Expand Down Expand Up @@ -2124,7 +2126,7 @@ def groupby_float32(ndarray[float32_t] index, ndarray labels):
for i in range(length):
key = util.get_value_1d(labels, i)

if _checknull(key):
if is_null_datetimelike(key):
continue

idx = index[i]
Expand Down Expand Up @@ -2152,7 +2154,7 @@ def groupby_object(ndarray[object] index, ndarray labels):
for i in range(length):
key = util.get_value_1d(labels, i)

if _checknull(key):
if is_null_datetimelike(key):
continue

idx = index[i]
Expand Down Expand Up @@ -2180,7 +2182,7 @@ def groupby_int32(ndarray[int32_t] index, ndarray labels):
for i in range(length):
key = util.get_value_1d(labels, i)

if _checknull(key):
if is_null_datetimelike(key):
continue

idx = index[i]
Expand Down Expand Up @@ -2208,7 +2210,7 @@ def groupby_int64(ndarray[int64_t] index, ndarray labels):
for i in range(length):
key = util.get_value_1d(labels, i)

if _checknull(key):
if is_null_datetimelike(key):
continue

idx = index[i]
Expand Down Expand Up @@ -2236,7 +2238,7 @@ def groupby_bool(ndarray[uint8_t] index, ndarray labels):
for i in range(length):
key = util.get_value_1d(labels, i)

if _checknull(key):
if is_null_datetimelike(key):
continue

idx = index[i]
Expand Down
44 changes: 43 additions & 1 deletion pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -699,7 +699,6 @@ def test_get_group(self):
expected = wp.reindex(major=[x for x in wp.major_axis if x.month == 1])
assert_panel_equal(gp, expected)


# GH 5267
# be datelike friendly
df = DataFrame({'DATE' : pd.to_datetime(['10-Oct-2013', '10-Oct-2013', '10-Oct-2013',
Expand Down Expand Up @@ -2837,6 +2836,49 @@ def test_groupby_list_infer_array_like(self):
result = df.groupby(['foo', 'bar']).mean()
expected = df.groupby([df['foo'], df['bar']]).mean()[['val']]

def test_groupby_nat_exclude(self):
# GH 6992
# Checks that rows whose grouping key is null (NaN / NaT) are left out of
# the groups, and that asking get_group for a null key raises KeyError.
#
# Fixture: 8 rows; the 'dt' and 'str' columns are null at positions
# 0, 2, 4 and 6. In 'dt', 2013-01-01 sits at positions 1 and 7 and
# 2013-02-01 at positions 3 and 5.
df = pd.DataFrame({'values': np.random.randn(8),
'dt': [np.nan, pd.Timestamp('2013-01-01'), np.nan, pd.Timestamp('2013-02-01'),
np.nan, pd.Timestamp('2013-02-01'), np.nan, pd.Timestamp('2013-01-01')],
'str': [np.nan, 'a', np.nan, 'a',
np.nan, 'a', np.nan, 'b']})
grouped = df.groupby('dt')

# Row positions expected for each key, listed in sorted-key order
# (2013-01-01 first, then 2013-02-01).
expected = [[1, 7], [3, 5]]
keys = sorted(grouped.groups.keys())
# Only the two real dates may appear as keys; there is no null group.
self.assertEqual(len(keys), 2)
for k, e in zip(keys, expected):
# The keys of grouped.groups are np.datetime64 values that carry the
# system timezone. To keep the check timezone-independent, the keys are
# not compared directly; only the row positions stored under them are.
self.assertEqual(grouped.groups[k], e)

# confirm obj is not filtered
# (the object held by the first grouping must still equal the full frame,
# null-key rows included)
tm.assert_frame_equal(grouped.grouper.groupings[0].obj, df)
self.assertEqual(grouped.ngroups, 2)
# Same row positions again, this time via .indices, keyed by Timestamp.
expected = {Timestamp('2013-01-01 00:00:00'): np.array([1, 7]),
Timestamp('2013-02-01 00:00:00'): np.array([3, 5])}
for k in grouped.indices:
self.assert_numpy_array_equal(grouped.indices[k], expected[k])

# get_group with a real date returns exactly the rows at those positions.
tm.assert_frame_equal(grouped.get_group(Timestamp('2013-01-01')), df.iloc[[1, 7]])
tm.assert_frame_equal(grouped.get_group(Timestamp('2013-02-01')), df.iloc[[3, 5]])

# A NaT key is not a group, so the lookup must fail with KeyError.
self.assertRaises(KeyError, grouped.get_group, pd.NaT)

# Edge case: columns that contain nothing but nulls. The dtype checks
# below pin 'nan' as float64 and 'nat' as datetime64[ns] before grouping.
nan_df = DataFrame({'nan': [np.nan, np.nan, np.nan],
'nat': [pd.NaT, pd.NaT, pd.NaT]})
self.assertEqual(nan_df['nan'].dtype, 'float64')
self.assertEqual(nan_df['nat'].dtype, 'datetime64[ns]')

# Grouping by an all-null column is expected to produce no groups at all:
# empty groups/indices, ngroups == 0, and KeyError for both NaN and NaT.
for key in ['nan', 'nat']:
grouped = nan_df.groupby(key)
self.assertEqual(grouped.groups, {})
self.assertEqual(grouped.ngroups, 0)
self.assertEqual(grouped.indices, {})
self.assertRaises(KeyError, grouped.get_group, np.nan)
self.assertRaises(KeyError, grouped.get_group, pd.NaT)

def test_dictify(self):
dict(iter(self.df.groupby('A')))
dict(iter(self.df.groupby(['A', 'B'])))
Expand Down
19 changes: 19 additions & 0 deletions pandas/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1858,6 +1858,25 @@ def test_ufunc_compat(self):
expected = Float64Index(np.sin(np.arange(5,dtype='int64')))
tm.assert_index_equal(result, expected)

def test_index_groupby(self):
# Checks that Index.groupby leaves out entries whose label is NaN or NaT,
# for several kinds of index.
#
# One six-element index of each kind: integers, floats, strings, and a
# date_range.
int_idx = Index(range(6))
float_idx = Index(np.arange(0, 0.6, 0.1))
obj_idx = Index('A B C D E F'.split())
dt_idx = pd.date_range('2013-01-01', freq='M', periods=6)

# NOTE(review): indentation is not preserved in this copy, so it is not
# visible whether the datetime-label case further down runs inside this
# loop (once per index) or once after it -- confirm against the original.
for idx in [int_idx, float_idx, obj_idx, dt_idx]:
# Numeric labels with NaN at positions 2 and 3: the expected mapping has
# only the keys 1.0 and 2.0, so the NaN-labelled entries are absent.
to_groupby = np.array([1, 2, np.nan, np.nan, 2, 1])
self.assertEqual(idx.groupby(to_groupby),
{1.0: [idx[0], idx[5]], 2.0: [idx[1], idx[4]]})

# Datetime labels with NaT at positions 2 and 3, taken as the raw
# .values of a UTC-localized Index.
to_groupby = Index([datetime(2011, 11, 1), datetime(2011, 12, 1),
pd.NaT, pd.NaT,
datetime(2011, 12, 1), datetime(2011, 11, 1)], tz='UTC').values

# Expected keys are built by converting the two Timestamps with
# pd.tslib.datetime_to_datetime64; element [0] of its return value is
# indexed to get the converted keys (presumably the function returns a
# tuple whose first item is the converted array -- TODO confirm).
ex_keys = pd.tslib.datetime_to_datetime64(np.array([Timestamp('2011-11-01'), Timestamp('2011-12-01')]))
# Only the two real dates appear as keys; the NaT-labelled entries are absent.
expected = {ex_keys[0][0]: [idx[0], idx[5]], ex_keys[0][1]: [idx[1], idx[4]]}
self.assertEqual(idx.groupby(to_groupby), expected)


class TestFloat64Index(Numeric, tm.TestCase):
_holder = Float64Index
Expand Down