Skip to content

Commit

Permalink
BUG: Fix some PeriodIndex resampling issues (#16153)
Browse files Browse the repository at this point in the history
closes #15944
xref #12884
closes #13083
closes #13224
  • Loading branch information
winklerand authored and jreback committed Oct 1, 2017
1 parent fd336fb commit 7d4a260
Show file tree
Hide file tree
Showing 3 changed files with 340 additions and 172 deletions.
76 changes: 76 additions & 0 deletions doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,82 @@ Other Enhancements
Backwards incompatible API changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. _whatsnew_0210.api_breaking.period_index_resampling:

``PeriodIndex`` resampling
^^^^^^^^^^^^^^^^^^^^^^^^^^

In previous versions of pandas, resampling a ``Series``/``DataFrame`` indexed by a ``PeriodIndex`` returned a ``DatetimeIndex`` in some cases (:issue:`12884`). Resampling to a multiplied frequency now returns a ``PeriodIndex`` (:issue:`15944`). As a minor enhancement, resampling a ``PeriodIndex`` can now handle ``NaT`` values (:issue:`13224`).

Previous Behavior:

.. code-block:: ipython

In [1]: pi = pd.period_range('2017-01', periods=12, freq='M')

In [2]: s = pd.Series(np.arange(12), index=pi)

In [3]: resampled = s.resample('2Q').mean()

In [4]: resampled
Out[4]:
2017-03-31 1.0
2017-09-30 5.5
2018-03-31 10.0
Freq: 2Q-DEC, dtype: float64

In [5]: resampled.index
Out[5]: DatetimeIndex(['2017-03-31', '2017-09-30', '2018-03-31'], dtype='datetime64[ns]', freq='2Q-DEC')

New Behavior:

.. ipython:: python

pi = pd.period_range('2017-01', periods=12, freq='M')

s = pd.Series(np.arange(12), index=pi)

resampled = s.resample('2Q').mean()

resampled

resampled.index


Upsampling and calling ``.ohlc()`` previously returned a ``Series``, basically identical to calling ``.asfreq()``. OHLC upsampling now returns a ``DataFrame`` with columns ``open``, ``high``, ``low`` and ``close`` (:issue:`13083`). This is consistent with downsampling and ``DatetimeIndex`` behavior.

Previous Behavior:

.. code-block:: ipython

In [1]: pi = pd.PeriodIndex(start='2000-01-01', freq='D', periods=10)

In [2]: s = pd.Series(np.arange(10), index=pi)

In [3]: s.resample('H').ohlc()
Out[3]:
2000-01-01 00:00 0.0
...
2000-01-10 23:00 NaN
Freq: H, Length: 240, dtype: float64

In [4]: s.resample('M').ohlc()
Out[4]:
open high low close
2000-01 0 9 0 9

New Behavior:

.. ipython:: python

pi = pd.PeriodIndex(start='2000-01-01', freq='D', periods=10)

s = pd.Series(np.arange(10), index=pi)

s.resample('H').ohlc()

s.resample('M').ohlc()


.. _whatsnew_0210.api_breaking.deps:

Expand Down
132 changes: 74 additions & 58 deletions pandas/core/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from pandas.core.indexes.datetimes import DatetimeIndex, date_range
from pandas.core.indexes.timedeltas import TimedeltaIndex
from pandas.tseries.offsets import DateOffset, Tick, Day, _delta_to_nanoseconds
from pandas.core.indexes.period import PeriodIndex, period_range
from pandas.core.indexes.period import PeriodIndex
import pandas.core.common as com
import pandas.core.algorithms as algos
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
Expand Down Expand Up @@ -834,53 +834,32 @@ class PeriodIndexResampler(DatetimeIndexResampler):
# Hands back the PeriodIndexResamplerGroupby class object itself (not an
# instance). NOTE(review): presumably the resampler used for grouped
# resampling; the class is defined outside this excerpt -- confirm.
def _resampler_for_grouping(self):
return PeriodIndexResamplerGroupby

def _get_binner_for_time(self):
if self.kind == 'timestamp':
return super(PeriodIndexResampler, self)._get_binner_for_time()
return self.groupby._get_period_bins(self.ax)

def _convert_obj(self, obj):
    """
    Prepare ``obj`` for resampling on a ``PeriodIndex``.

    The object is first passed through the parent class ``_convert_obj``.
    It is converted to timestamps (via ``obj.to_timestamp``) only when
    ``self.kind`` is ``'timestamp'``; a non-None ``self.loffset`` forces
    that kind. A multiplied target frequency no longer forces a
    conversion to timestamps.

    Parameters
    ----------
    obj : object to be resampled; must provide ``to_timestamp`` when a
        conversion to timestamps is required

    Returns
    -------
    The converted object.

    Raises
    ------
    NotImplementedError
        If the resampler was created from a ``level=`` or ``on=``
        selection (``self._from_selection`` is truthy).
    """
    obj = super(PeriodIndexResampler, self)._convert_obj(obj)

    if self._from_selection:
        # see GH 14008, GH 12871
        msg = ("Resampling from level= or on= selection"
               " with a PeriodIndex is not currently supported,"
               " use .set_index(...) to explicitly set index")
        raise NotImplementedError(msg)

    if self.loffset is not None:
        # Cannot apply loffset/timedelta to PeriodIndex -> convert to
        # timestamps
        self.kind = 'timestamp'

    # convert to timestamp (single conversion point; the previous body
    # contained this conversion twice)
    if self.kind == 'timestamp':
        obj = obj.to_timestamp(how=self.convention)

    return obj

def aggregate(self, arg, *args, **kwargs):
    """
    Aggregate using ``arg`` and apply the configured ``loffset``.

    ``self._aggregate`` is tried first; when it yields ``None`` as its
    result, the work is handed to ``self._downsample`` with the same
    arguments. Whatever result is obtained is passed through
    ``self._apply_loffset`` before being returned.
    """
    outcome, _ = self._aggregate(arg, *args, **kwargs)
    if outcome is None:
        outcome = self._downsample(arg, *args, **kwargs)
    return self._apply_loffset(outcome)


# short alias for ``aggregate``
agg = aggregate

def _get_new_index(self):
""" return our new index """
ax = self.ax

if len(ax) == 0:
values = []
else:
start = ax[0].asfreq(self.freq, how=self.convention)
end = ax[-1].asfreq(self.freq, how='end')
values = period_range(start, end, freq=self.freq).asi8

return ax._shallow_copy(values, freq=self.freq)

def _downsample(self, how, **kwargs):
"""
Downsample the cython defined function
Expand All @@ -898,22 +877,17 @@ def _downsample(self, how, **kwargs):
how = self._is_cython_func(how) or how
ax = self.ax

new_index = self._get_new_index()

# Start vs. end of period
memb = ax.asfreq(self.freq, how=self.convention)

if is_subperiod(ax.freq, self.freq):
# Downsampling
if len(new_index) == 0:
bins = []
else:
i8 = memb.asi8
rng = np.arange(i8[0], i8[-1] + 1)
bins = memb.searchsorted(rng, side='right')
grouper = BinGrouper(bins, new_index)
return self._groupby_and_aggregate(how, grouper=grouper)
return self._groupby_and_aggregate(how, grouper=self.grouper)
elif is_superperiod(ax.freq, self.freq):
if how == 'ohlc':
# GH #13083
# upsampling to subperiods is handled as an asfreq, which works
# for pure aggregating/reducing methods
# OHLC reduces along the time dimension, but creates multiple
# values for each period -> handle by _groupby_and_aggregate()
return self._groupby_and_aggregate(how, grouper=self.grouper)
return self.asfreq()
elif ax.freq == self.freq:
return self.asfreq()
Expand All @@ -936,19 +910,16 @@ def _upsample(self, method, limit=None, fill_value=None):
.fillna
"""
if self._from_selection:
raise ValueError("Upsampling from level= or on= selection"
" is not supported, use .set_index(...)"
" to explicitly set index to"
" datetime-like")

# we may need to actually resample as if we are timestamps
if self.kind == 'timestamp':
return super(PeriodIndexResampler, self)._upsample(
method, limit=limit, fill_value=fill_value)

self._set_binner()
ax = self.ax
obj = self.obj
new_index = self._get_new_index()
new_index = self.binner

# Start vs. end of period
memb = ax.asfreq(self.freq, how=self.convention)
Expand Down Expand Up @@ -1293,6 +1264,51 @@ def _get_time_period_bins(self, ax):

return binner, bins, labels

def _get_period_bins(self, ax):
if not isinstance(ax, PeriodIndex):
raise TypeError('axis must be a PeriodIndex, but got '
'an instance of %r' % type(ax).__name__)

memb = ax.asfreq(self.freq, how=self.convention)

# NaT handling as in pandas._lib.lib.generate_bins_dt64()
nat_count = 0
if memb.hasnans:
nat_count = np.sum(memb._isnan)
memb = memb[~memb._isnan]

# if index contains no valid (non-NaT) values, return empty index
if not len(memb):
binner = labels = PeriodIndex(
data=[], freq=self.freq, name=ax.name)
return binner, [], labels

start = ax.min().asfreq(self.freq, how=self.convention)
end = ax.max().asfreq(self.freq, how='end')

labels = binner = PeriodIndex(start=start, end=end,
freq=self.freq, name=ax.name)

i8 = memb.asi8
freq_mult = self.freq.n

# when upsampling to subperiods, we need to generate enough bins
expected_bins_count = len(binner) * freq_mult
i8_extend = expected_bins_count - (i8[-1] - i8[0])
rng = np.arange(i8[0], i8[-1] + i8_extend, freq_mult)
rng += freq_mult
bins = memb.searchsorted(rng, side='left')

if nat_count > 0:
# NaT handling as in pandas._lib.lib.generate_bins_dt64()
# shift bins by the number of NaT
bins += nat_count
bins = np.insert(bins, 0, nat_count)
binner = binner.insert(0, tslib.NaT)
labels = labels.insert(0, tslib.NaT)

return binner, bins, labels


def _take_new_index(obj, indexer, new_index, axis=0):
from pandas.core.api import Series, DataFrame
Expand Down
Loading

0 comments on commit 7d4a260

Please sign in to comment.