Skip to content

Commit

Permalink
BUG: GroupBy.get_group raises ValueError when group key contains NaT
Browse files Browse the repository at this point in the history
  • Loading branch information
sinhrks committed May 16, 2015
1 parent d03a22f commit 3605738
Show file tree
Hide file tree
Showing 8 changed files with 92 additions and 14 deletions.
6 changes: 3 additions & 3 deletions doc/source/groupby.rst
Original file line number Diff line number Diff line change
Expand Up @@ -784,11 +784,11 @@ will be (silently) dropped. Thus, this does not pose any problems:
df.groupby('A').std()
NA group handling
NA and NaT group handling
~~~~~~~~~~~~~~~~~~~~~~~~~

If there are any NaN values in the grouping key, these will be automatically
excluded. So there will never be an "NA group". This was not the case in older
If there are any NaN or NaT values in the grouping key, these will be automatically
excluded. So there will never be an "NA group" or "NaT group". This was not the case in older
versions of pandas, but users were generally discarding the NA group anyway
(and supporting it was an implementation headache).

Expand Down
7 changes: 7 additions & 0 deletions doc/source/whatsnew/v0.17.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -66,4 +66,11 @@ Bug Fixes
- Bug in ``Timestamp``'s ``microsecond``, ``quarter``, ``dayofyear``, ``week`` and ``daysinmonth`` properties returning ``np.int`` type instead of built-in ``int``. (:issue:`10050`)
- Bug in ``NaT`` raising ``AttributeError`` when accessing the ``daysinmonth`` and ``dayofweek`` properties. (:issue:`10096`)


- Bug in getting timezone data with ``dateutil`` on various platforms (:issue:`9059`, :issue:`8639`, :issue:`9663`, :issue:`10121`)




- Bug in ``GroupBy.get_group`` raising ``ValueError`` when the group key contains ``NaT`` (:issue:`6992`)

6 changes: 4 additions & 2 deletions pandas/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ cdef extern from "src/headers/math.h":
int signbit(double)

from pandas import lib
from pandas import tslib
cdef object NaT = tslib.NaT

include "skiplist.pyx"

Expand Down Expand Up @@ -2010,7 +2012,7 @@ def groupby_indices(ndarray values):
k = labels[i]

# was NaN
if k == -1:
if k == -1 or k is NaT:
continue

loc = seen[k]
Expand Down Expand Up @@ -2043,7 +2045,7 @@ def group_labels(ndarray[object] values):
val = values[i]

# is NaN
if val != val:
if val != val or val is NaT:
labels[i] = -1
continue

Expand Down
6 changes: 5 additions & 1 deletion pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,7 +426,11 @@ def convert(key, s):
return Timestamp(key).asm8
return key

sample = next(iter(self.indices))
if len(self.indices) > 0:
sample = next(iter(self.indices))
else:
sample = None # Dummy sample

if isinstance(sample, tuple):
if not isinstance(name, tuple):
msg = ("must supply a tuple to get_group with multiple"
Expand Down
4 changes: 3 additions & 1 deletion pandas/src/generate_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@
cimport util
from util cimport is_array, _checknull, _checknan, get_nat
cimport lib
from lib cimport is_null_datetimelike
cdef int64_t iNaT = get_nat()
Expand Down Expand Up @@ -673,7 +675,7 @@ def groupby_%(name)s(ndarray[%(c_type)s] index, ndarray labels):
for i in range(length):
key = util.get_value_1d(labels, i)
if _checknull(key):
if is_null_datetimelike(key):
continue
idx = index[i]
Expand Down
14 changes: 8 additions & 6 deletions pandas/src/generated.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ ctypedef unsigned char UChar

cimport util
from util cimport is_array, _checknull, _checknan, get_nat
cimport lib
from lib cimport is_null_datetimelike

cdef int64_t iNaT = get_nat()

Expand Down Expand Up @@ -2096,7 +2098,7 @@ def groupby_float64(ndarray[float64_t] index, ndarray labels):
for i in range(length):
key = util.get_value_1d(labels, i)

if _checknull(key):
if is_null_datetimelike(key):
continue

idx = index[i]
Expand Down Expand Up @@ -2124,7 +2126,7 @@ def groupby_float32(ndarray[float32_t] index, ndarray labels):
for i in range(length):
key = util.get_value_1d(labels, i)

if _checknull(key):
if is_null_datetimelike(key):
continue

idx = index[i]
Expand Down Expand Up @@ -2152,7 +2154,7 @@ def groupby_object(ndarray[object] index, ndarray labels):
for i in range(length):
key = util.get_value_1d(labels, i)

if _checknull(key):
if is_null_datetimelike(key):
continue

idx = index[i]
Expand Down Expand Up @@ -2180,7 +2182,7 @@ def groupby_int32(ndarray[int32_t] index, ndarray labels):
for i in range(length):
key = util.get_value_1d(labels, i)

if _checknull(key):
if is_null_datetimelike(key):
continue

idx = index[i]
Expand Down Expand Up @@ -2208,7 +2210,7 @@ def groupby_int64(ndarray[int64_t] index, ndarray labels):
for i in range(length):
key = util.get_value_1d(labels, i)

if _checknull(key):
if is_null_datetimelike(key):
continue

idx = index[i]
Expand Down Expand Up @@ -2236,7 +2238,7 @@ def groupby_bool(ndarray[uint8_t] index, ndarray labels):
for i in range(length):
key = util.get_value_1d(labels, i)

if _checknull(key):
if is_null_datetimelike(key):
continue

idx = index[i]
Expand Down
44 changes: 43 additions & 1 deletion pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -699,7 +699,6 @@ def test_get_group(self):
expected = wp.reindex(major=[x for x in wp.major_axis if x.month == 1])
assert_panel_equal(gp, expected)


# GH 5267
# be datelike friendly
df = DataFrame({'DATE' : pd.to_datetime(['10-Oct-2013', '10-Oct-2013', '10-Oct-2013',
Expand Down Expand Up @@ -2837,6 +2836,49 @@ def test_groupby_list_infer_array_like(self):
result = df.groupby(['foo', 'bar']).mean()
expected = df.groupby([df['foo'], df['bar']]).mean()[['val']]

def test_groupby_nat_exclude(self):
# GH 6992
# Regression test: null values (NaN / NaT) in a grouping key must not form
# a group, and get_group must raise KeyError when asked for such a key.
#
# Eight rows; positions 0, 2, 4 and 6 hold np.nan in both the 'dt' and the
# 'str' column, the remaining rows carry real values.
df = pd.DataFrame({'values': np.random.randn(8),
'dt': [np.nan, pd.Timestamp('2013-01-01'), np.nan, pd.Timestamp('2013-02-01'),
np.nan, pd.Timestamp('2013-02-01'), np.nan, pd.Timestamp('2013-01-01')],
'str': [np.nan, 'a', np.nan, 'a',
np.nan, 'a', np.nan, 'b']})
grouped = df.groupby('dt')

# Expected row labels per group, listed in sorted-key order:
# 2013-01-01 -> rows 1 and 7, 2013-02-01 -> rows 3 and 5.
expected = [[1, 7], [3, 5]]
keys = sorted(grouped.groups.keys())
self.assertEqual(len(keys), 2)
for k, e in zip(keys, expected):
# The keys of grouped.groups are np.datetime64 values that carry the
# system timezone; to keep this check timezone-independent, compare
# only the grouped row labels and never the keys themselves.
self.assertEqual(grouped.groups[k], e)

# confirm the object held by the first grouping is the original,
# unfiltered frame
tm.assert_frame_equal(grouped.grouper.groupings[0].obj, df)
self.assertEqual(grouped.ngroups, 2)
# grouped.indices is expected to hold, per Timestamp key, the array of
# row positions belonging to that group.
expected = {Timestamp('2013-01-01 00:00:00'): np.array([1, 7]),
Timestamp('2013-02-01 00:00:00'): np.array([3, 5])}
for k in grouped.indices:
self.assert_numpy_array_equal(grouped.indices[k], expected[k])

# get_group with a real key returns exactly the rows of that group.
tm.assert_frame_equal(grouped.get_group(Timestamp('2013-01-01')), df.iloc[[1, 7]])
tm.assert_frame_equal(grouped.get_group(Timestamp('2013-02-01')), df.iloc[[3, 5]])

# NaT is not a group key here, so looking it up must raise KeyError.
self.assertRaises(KeyError, grouped.get_group, pd.NaT)

# Columns that consist solely of nulls: 'nan' is float64 all-NaN and
# 'nat' is datetime64[ns] all-NaT (both dtypes are asserted below).
nan_df = DataFrame({'nan': [np.nan, np.nan, np.nan],
'nat': [pd.NaT, pd.NaT, pd.NaT]})
self.assertEqual(nan_df['nan'].dtype, 'float64')
self.assertEqual(nan_df['nat'].dtype, 'datetime64[ns]')

# Grouping by an all-null column must yield no groups at all, and any
# lookup (NaN or NaT) must raise KeyError.
for key in ['nan', 'nat']:
grouped = nan_df.groupby(key)
self.assertEqual(grouped.groups, {})
self.assertEqual(grouped.ngroups, 0)
self.assertEqual(grouped.indices, {})
self.assertRaises(KeyError, grouped.get_group, np.nan)
self.assertRaises(KeyError, grouped.get_group, pd.NaT)

def test_dictify(self):
dict(iter(self.df.groupby('A')))
dict(iter(self.df.groupby(['A', 'B'])))
Expand Down
19 changes: 19 additions & 0 deletions pandas/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1858,6 +1858,25 @@ def test_ufunc_compat(self):
expected = Float64Index(np.sin(np.arange(5,dtype='int64')))
tm.assert_index_equal(result, expected)

def test_index_groupby(self):
# Index.groupby must leave out entries whose label is NaN or NaT; this is
# checked for several index types.
#
# Four six-element indexes: integer, float, object (strings) and a
# datetime range with freq='M'.
int_idx = Index(range(6))
float_idx = Index(np.arange(0, 0.6, 0.1))
obj_idx = Index('A B C D E F'.split())
dt_idx = pd.date_range('2013-01-01', freq='M', periods=6)

for idx in [int_idx, float_idx, obj_idx, dt_idx]:
# Float labels with NaN at positions 2 and 3: the expected mapping has
# no entry for those positions.
to_groupby = np.array([1, 2, np.nan, np.nan, 2, 1])
self.assertEqual(idx.groupby(to_groupby),
{1.0: [idx[0], idx[5]], 2.0: [idx[1], idx[4]]})

# Datetime labels with NaT at positions 2 and 3, built as a UTC index
# and then reduced to its underlying array via .values.
to_groupby = Index([datetime(2011, 11, 1), datetime(2011, 12, 1),
pd.NaT, pd.NaT,
datetime(2011, 12, 1), datetime(2011, 11, 1)], tz='UTC').values

# NOTE(review): ex_keys[0] is indexed like an array below, so
# datetime_to_datetime64 presumably returns a tuple whose first item is
# the converted datetime64 array -- confirm against pandas.tslib.
ex_keys = pd.tslib.datetime_to_datetime64(np.array([Timestamp('2011-11-01'), Timestamp('2011-12-01')]))
expected = {ex_keys[0][0]: [idx[0], idx[5]], ex_keys[0][1]: [idx[1], idx[4]]}
self.assertEqual(idx.groupby(to_groupby), expected)


class TestFloat64Index(Numeric, tm.TestCase):
_holder = Float64Index
Expand Down

0 comments on commit 3605738

Please sign in to comment.