BUG: Fix some PeriodIndex resampling issues (#16153)

winklerand · jreback · commit 7d4a260cbe6d · 2017-10-01T10:55:32.000-04:00
closes #15944 xref #12884 closes #13083 closes #13224
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
@@ -171,6 +171,82 @@ Other Enhancements
 Backwards incompatible API changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
+.. _whatsnew_0210.api_breaking.period_index_resampling:
+
+``PeriodIndex`` resampling
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In previous versions of pandas, resampling a ``Series``/``DataFrame`` indexed by a ``PeriodIndex`` returned a ``DatetimeIndex`` in some cases (:issue:`12884`). Resampling to a multiplied frequency now returns a ``PeriodIndex`` (:issue:`15944`). As a minor enhancement, resampling a ``PeriodIndex`` can now handle ``NaT`` values (:issue:`13224`).
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+   In [1]: pi = pd.period_range('2017-01', periods=12, freq='M')
+
+   In [2]: s = pd.Series(np.arange(12), index=pi)
+
+   In [3]: resampled = s.resample('2Q').mean()
+
+   In [4]: resampled
+   Out[4]:
+   2017-03-31     1.0
+   2017-09-30     5.5
+   2018-03-31    10.0
+   Freq: 2Q-DEC, dtype: float64
+
+   In [5]: resampled.index
+   Out[5]: DatetimeIndex(['2017-03-31', '2017-09-30', '2018-03-31'], dtype='datetime64[ns]', freq='2Q-DEC')
+
+New Behavior:
+
+.. ipython:: python
+
+   pi = pd.period_range('2017-01', periods=12, freq='M')
+
+   s = pd.Series(np.arange(12), index=pi)
+
+   resampled = s.resample('2Q').mean()
+
+   resampled
+
+   resampled.index
+
+
+Upsampling and calling ``.ohlc()`` previously returned a ``Series``, basically identical to calling ``.asfreq()``. OHLC upsampling now returns a ``DataFrame`` with columns ``open``, ``high``, ``low`` and ``close`` (:issue:`13083`). This is consistent with downsampling and ``DatetimeIndex`` behavior.
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+   In [1]: pi = pd.PeriodIndex(start='2000-01-01', freq='D', periods=10)
+
+   In [2]: s = pd.Series(np.arange(10), index=pi)
+
+   In [3]: s.resample('H').ohlc()
+   Out[3]:
+   2000-01-01 00:00    0.0
+                   ...
+   2000-01-10 23:00    NaN
+   Freq: H, Length: 240, dtype: float64
+
+   In [4]: s.resample('M').ohlc()
+   Out[4]:
+            open  high  low  close
+   2000-01     0     9    0      9
+
+New Behavior:
+
+.. ipython:: python
+
+   pi = pd.PeriodIndex(start='2000-01-01', freq='D', periods=10)
+
+   s = pd.Series(np.arange(10), index=pi)
+
+   s.resample('H').ohlc()
+
+   s.resample('M').ohlc()
+
 
 .. _whatsnew_0210.api_breaking.deps:
 
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
@@ -14,7 +14,7 @@
 from pandas.core.indexes.datetimes import DatetimeIndex, date_range
 from pandas.core.indexes.timedeltas import TimedeltaIndex
 from pandas.tseries.offsets import DateOffset, Tick, Day, _delta_to_nanoseconds
-from pandas.core.indexes.period import PeriodIndex, period_range
+from pandas.core.indexes.period import PeriodIndex
 import pandas.core.common as com
 import pandas.core.algorithms as algos
 from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
@@ -834,53 +834,32 @@ class PeriodIndexResampler(DatetimeIndexResampler):
     def _resampler_for_grouping(self):
         return PeriodIndexResamplerGroupby
 
+    def _get_binner_for_time(self):
+        if self.kind == 'timestamp':
+            return super(PeriodIndexResampler, self)._get_binner_for_time()
+        return self.groupby._get_period_bins(self.ax)
+
     def _convert_obj(self, obj):
         obj = super(PeriodIndexResampler, self)._convert_obj(obj)
 
-        offset = to_offset(self.freq)
-        if offset.n > 1:
-            if self.kind == 'period':  # pragma: no cover
-                print('Warning: multiple of frequency -> timestamps')
-
-            # Cannot have multiple of periods, convert to timestamp
+        if self._from_selection:
+            # see GH 14008, GH 12871
+            msg = ("Resampling from level= or on= selection"
+                   " with a PeriodIndex is not currently supported,"
+                   " use .set_index(...) to explicitly set index")
+            raise NotImplementedError(msg)
+
+        if self.loffset is not None:
+            # Cannot apply loffset/timedelta to PeriodIndex -> convert to
+            # timestamps
             self.kind = 'timestamp'
 
         # convert to timestamp
-        if not (self.kind is None or self.kind == 'period'):
-            if self._from_selection:
-                # see GH 14008, GH 12871
-                msg = ("Resampling from level= or on= selection"
-                       " with a PeriodIndex is not currently supported,"
-                       " use .set_index(...) to explicitly set index")
-                raise NotImplementedError(msg)
-            else:
-                obj = obj.to_timestamp(how=self.convention)
+        if self.kind == 'timestamp':
+            obj = obj.to_timestamp(how=self.convention)
 
         return obj
 
-    def aggregate(self, arg, *args, **kwargs):
-        result, how = self._aggregate(arg, *args, **kwargs)
-        if result is None:
-            result = self._downsample(arg, *args, **kwargs)
-
-        result = self._apply_loffset(result)
-        return result
-
-    agg = aggregate
-
-    def _get_new_index(self):
-        """ return our new index """
-        ax = self.ax
-
-        if len(ax) == 0:
-            values = []
-        else:
-            start = ax[0].asfreq(self.freq, how=self.convention)
-            end = ax[-1].asfreq(self.freq, how='end')
-            values = period_range(start, end, freq=self.freq).asi8
-
-        return ax._shallow_copy(values, freq=self.freq)
-
     def _downsample(self, how, **kwargs):
         """
         Downsample the cython defined function
@@ -898,22 +877,17 @@ def _downsample(self, how, **kwargs):
         how = self._is_cython_func(how) or how
         ax = self.ax
 
-        new_index = self._get_new_index()
-
-        # Start vs. end of period
-        memb = ax.asfreq(self.freq, how=self.convention)
-
         if is_subperiod(ax.freq, self.freq):
             # Downsampling
-            if len(new_index) == 0:
-                bins = []
-            else:
-                i8 = memb.asi8
-                rng = np.arange(i8[0], i8[-1] + 1)
-                bins = memb.searchsorted(rng, side='right')
-            grouper = BinGrouper(bins, new_index)
-            return self._groupby_and_aggregate(how, grouper=grouper)
+            return self._groupby_and_aggregate(how, grouper=self.grouper)
         elif is_superperiod(ax.freq, self.freq):
+            if how == 'ohlc':
+                # GH #13083
+                # upsampling to subperiods is handled as an asfreq, which works
+                # for pure aggregating/reducing methods
+                # OHLC reduces along the time dimension, but creates multiple
+                # values for each period -> handle by _groupby_and_aggregate()
+                return self._groupby_and_aggregate(how, grouper=self.grouper)
             return self.asfreq()
         elif ax.freq == self.freq:
             return self.asfreq()
@@ -936,19 +910,16 @@ def _upsample(self, method, limit=None, fill_value=None):
         .fillna
 
         """
-        if self._from_selection:
-            raise ValueError("Upsampling from level= or on= selection"
-                             " is not supported, use .set_index(...)"
-                             " to explicitly set index to"
-                             " datetime-like")
+
         # we may need to actually resample as if we are timestamps
         if self.kind == 'timestamp':
             return super(PeriodIndexResampler, self)._upsample(
                 method, limit=limit, fill_value=fill_value)
 
+        self._set_binner()
         ax = self.ax
         obj = self.obj
-        new_index = self._get_new_index()
+        new_index = self.binner
 
         # Start vs. end of period
         memb = ax.asfreq(self.freq, how=self.convention)
@@ -1293,6 +1264,51 @@ def _get_time_period_bins(self, ax):
 
         return binner, bins, labels
 
+    def _get_period_bins(self, ax):
+        if not isinstance(ax, PeriodIndex):
+            raise TypeError('axis must be a PeriodIndex, but got '
+                            'an instance of %r' % type(ax).__name__)
+
+        memb = ax.asfreq(self.freq, how=self.convention)
+
+        # NaT handling as in pandas._libs.lib.generate_bins_dt64()
+        nat_count = 0
+        if memb.hasnans:
+            nat_count = np.sum(memb._isnan)
+            memb = memb[~memb._isnan]
+
+        # if index contains no valid (non-NaT) values, return empty index
+        if not len(memb):
+            binner = labels = PeriodIndex(
+                data=[], freq=self.freq, name=ax.name)
+            return binner, [], labels
+
+        start = ax.min().asfreq(self.freq, how=self.convention)
+        end = ax.max().asfreq(self.freq, how='end')
+
+        labels = binner = PeriodIndex(start=start, end=end,
+                                      freq=self.freq, name=ax.name)
+
+        i8 = memb.asi8
+        freq_mult = self.freq.n
+
+        # when upsampling to subperiods, we need to generate enough bins
+        expected_bins_count = len(binner) * freq_mult
+        i8_extend = expected_bins_count - (i8[-1] - i8[0])
+        rng = np.arange(i8[0], i8[-1] + i8_extend, freq_mult)
+        rng += freq_mult
+        bins = memb.searchsorted(rng, side='left')
+
+        if nat_count > 0:
+            # NaT handling as in pandas._libs.lib.generate_bins_dt64()
+            # shift bins by the number of NaT
+            bins += nat_count
+            bins = np.insert(bins, 0, nat_count)
+            binner = binner.insert(0, tslib.NaT)
+            labels = labels.insert(0, tslib.NaT)
+
+        return binner, bins, labels
+
 
 def _take_new_index(obj, indexer, new_index, axis=0):
     from pandas.core.api import Series, DataFrame
diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py