diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 7ad2641dec52a..573f476ec64d3 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -784,11 +784,11 @@ will be (silently) dropped. Thus, this does not pose any problems: df.groupby('A').std() -NA group handling -~~~~~~~~~~~~~~~~~ +NA and NaT group handling +~~~~~~~~~~~~~~~~~~~~~~~~~ -If there are any NaN values in the grouping key, these will be automatically -excluded. So there will never be an "NA group". This was not the case in older +If there are any NaN or NaT values in the grouping key, these will be automatically +excluded. So there will never be an "NA group" or "NaT group". This was not the case in older versions of pandas, but users were generally discarding the NA group anyway (and supporting it was an implementation headache). diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 2a4a408643451..141310aa84838 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -66,4 +66,11 @@ Bug Fixes - Bug in ``Timestamp``'s' ``microsecond``, ``quarter``, ``dayofyear``, ``week`` and ``daysinmonth`` properties return ``np.int`` type, not built-in ``int``. (:issue:`10050`) - Bug in ``NaT`` raises ``AttributeError`` when accessing to ``daysinmonth``, ``dayofweek`` properties. 
(:issue:`10096`) + - Bug in getting timezone data with ``dateutil`` on various platforms ( :issue:`9059`, :issue:`8639`, :issue:`9663`, :issue:`10121`) + + + + +- Bug in ``GroupBy.get_group`` raises ``ValueError`` when group key contains ``NaT`` (:issue:`6992`) + diff --git a/pandas/algos.pyx b/pandas/algos.pyx index 5f68c1ee26e87..90ebffce6c921 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -61,6 +61,8 @@ cdef extern from "src/headers/math.h": int signbit(double) from pandas import lib +from pandas import tslib +cdef object NaT = tslib.NaT include "skiplist.pyx" @@ -2010,7 +2012,7 @@ def groupby_indices(ndarray values): k = labels[i] # was NaN - if k == -1: + if k == -1 or k is NaT: continue loc = seen[k] @@ -2043,7 +2045,7 @@ def group_labels(ndarray[object] values): val = values[i] # is NaN - if val != val: + if val != val or val is NaT: labels[i] = -1 continue diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 4b7d8b9796f01..0531fd447be9f 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -426,7 +426,11 @@ def convert(key, s): return Timestamp(key).asm8 return key - sample = next(iter(self.indices)) + if len(self.indices) > 0: + sample = next(iter(self.indices)) + else: + sample = None # Dummy sample + if isinstance(sample, tuple): if not isinstance(name, tuple): msg = ("must supply a tuple to get_group with multiple" diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py index a0cdc0ff5e841..598cdff30e4f7 100644 --- a/pandas/src/generate_code.py +++ b/pandas/src/generate_code.py @@ -37,6 +37,8 @@ cimport util from util cimport is_array, _checknull, _checknan, get_nat +cimport lib +from lib cimport is_null_datetimelike cdef int64_t iNaT = get_nat() @@ -673,7 +675,7 @@ def groupby_%(name)s(ndarray[%(c_type)s] index, ndarray labels): for i in range(length): key = util.get_value_1d(labels, i) - if _checknull(key): + if is_null_datetimelike(key): continue idx = index[i] diff --git a/pandas/src/generated.pyx 
b/pandas/src/generated.pyx index 79722a26ebedc..428decd4dca10 100644 --- a/pandas/src/generated.pyx +++ b/pandas/src/generated.pyx @@ -28,6 +28,8 @@ ctypedef unsigned char UChar cimport util from util cimport is_array, _checknull, _checknan, get_nat +cimport lib +from lib cimport is_null_datetimelike cdef int64_t iNaT = get_nat() @@ -2096,7 +2098,7 @@ def groupby_float64(ndarray[float64_t] index, ndarray labels): for i in range(length): key = util.get_value_1d(labels, i) - if _checknull(key): + if is_null_datetimelike(key): continue idx = index[i] @@ -2124,7 +2126,7 @@ def groupby_float32(ndarray[float32_t] index, ndarray labels): for i in range(length): key = util.get_value_1d(labels, i) - if _checknull(key): + if is_null_datetimelike(key): continue idx = index[i] @@ -2152,7 +2154,7 @@ def groupby_object(ndarray[object] index, ndarray labels): for i in range(length): key = util.get_value_1d(labels, i) - if _checknull(key): + if is_null_datetimelike(key): continue idx = index[i] @@ -2180,7 +2182,7 @@ def groupby_int32(ndarray[int32_t] index, ndarray labels): for i in range(length): key = util.get_value_1d(labels, i) - if _checknull(key): + if is_null_datetimelike(key): continue idx = index[i] @@ -2208,7 +2210,7 @@ def groupby_int64(ndarray[int64_t] index, ndarray labels): for i in range(length): key = util.get_value_1d(labels, i) - if _checknull(key): + if is_null_datetimelike(key): continue idx = index[i] @@ -2236,7 +2238,7 @@ def groupby_bool(ndarray[uint8_t] index, ndarray labels): for i in range(length): key = util.get_value_1d(labels, i) - if _checknull(key): + if is_null_datetimelike(key): continue idx = index[i] diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index c308308603167..0789e20df3945 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -699,7 +699,6 @@ def test_get_group(self): expected = wp.reindex(major=[x for x in wp.major_axis if x.month == 1]) assert_panel_equal(gp, expected) - # GH 5267 # be 
datelike friendly df = DataFrame({'DATE' : pd.to_datetime(['10-Oct-2013', '10-Oct-2013', '10-Oct-2013', @@ -2837,6 +2836,49 @@ def test_groupby_list_infer_array_like(self): result = df.groupby(['foo', 'bar']).mean() expected = df.groupby([df['foo'], df['bar']]).mean()[['val']] + def test_groupby_nat_exclude(self): + # GH 6992 + df = pd.DataFrame({'values': np.random.randn(8), + 'dt': [np.nan, pd.Timestamp('2013-01-01'), np.nan, pd.Timestamp('2013-02-01'), + np.nan, pd.Timestamp('2013-02-01'), np.nan, pd.Timestamp('2013-01-01')], + 'str': [np.nan, 'a', np.nan, 'a', + np.nan, 'a', np.nan, 'b']}) + grouped = df.groupby('dt') + + expected = [[1, 7], [3, 5]] + keys = sorted(grouped.groups.keys()) + self.assertEqual(len(keys), 2) + for k, e in zip(keys, expected): + # grouped.groups keys are np.datetime64 with system tz + # not to be affected by tz, only compare values + self.assertEqual(grouped.groups[k], e) + + # confirm obj is not filtered + tm.assert_frame_equal(grouped.grouper.groupings[0].obj, df) + self.assertEqual(grouped.ngroups, 2) + expected = {Timestamp('2013-01-01 00:00:00'): np.array([1, 7]), + Timestamp('2013-02-01 00:00:00'): np.array([3, 5])} + for k in grouped.indices: + self.assert_numpy_array_equal(grouped.indices[k], expected[k]) + + tm.assert_frame_equal(grouped.get_group(Timestamp('2013-01-01')), df.iloc[[1, 7]]) + tm.assert_frame_equal(grouped.get_group(Timestamp('2013-02-01')), df.iloc[[3, 5]]) + + self.assertRaises(KeyError, grouped.get_group, pd.NaT) + + nan_df = DataFrame({'nan': [np.nan, np.nan, np.nan], + 'nat': [pd.NaT, pd.NaT, pd.NaT]}) + self.assertEqual(nan_df['nan'].dtype, 'float64') + self.assertEqual(nan_df['nat'].dtype, 'datetime64[ns]') + + for key in ['nan', 'nat']: + grouped = nan_df.groupby(key) + self.assertEqual(grouped.groups, {}) + self.assertEqual(grouped.ngroups, 0) + self.assertEqual(grouped.indices, {}) + self.assertRaises(KeyError, grouped.get_group, np.nan) + self.assertRaises(KeyError, grouped.get_group, pd.NaT) + def 
test_dictify(self): dict(iter(self.df.groupby('A'))) dict(iter(self.df.groupby(['A', 'B']))) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 444aa2a0bab1e..93299292cf353 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -1858,6 +1858,25 @@ def test_ufunc_compat(self): expected = Float64Index(np.sin(np.arange(5,dtype='int64'))) tm.assert_index_equal(result, expected) + def test_index_groupby(self): + int_idx = Index(range(6)) + float_idx = Index(np.arange(0, 0.6, 0.1)) + obj_idx = Index('A B C D E F'.split()) + dt_idx = pd.date_range('2013-01-01', freq='M', periods=6) + + for idx in [int_idx, float_idx, obj_idx, dt_idx]: + to_groupby = np.array([1, 2, np.nan, np.nan, 2, 1]) + self.assertEqual(idx.groupby(to_groupby), + {1.0: [idx[0], idx[5]], 2.0: [idx[1], idx[4]]}) + + to_groupby = Index([datetime(2011, 11, 1), datetime(2011, 12, 1), + pd.NaT, pd.NaT, + datetime(2011, 12, 1), datetime(2011, 11, 1)], tz='UTC').values + + ex_keys = pd.tslib.datetime_to_datetime64(np.array([Timestamp('2011-11-01'), Timestamp('2011-12-01')])) + expected = {ex_keys[0][0]: [idx[0], idx[5]], ex_keys[0][1]: [idx[1], idx[4]]} + self.assertEqual(idx.groupby(to_groupby), expected) + class TestFloat64Index(Numeric, tm.TestCase): _holder = Float64Index