Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Timestamp origin takes no effect in resample for 'MS' frequency #53938

Merged
merged 14 commits into from
Jul 25, 2023
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -496,6 +496,7 @@ Plotting

Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` where a :class:`Datetimelike` ``origin`` had no effect when resampling with a non-fixed frequency such as ``MS`` (:issue:`53662`)
mcgeestocks marked this conversation as resolved.
Show resolved Hide resolved
- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` in incorrectly allowing non-fixed ``freq`` when resampling on a :class:`TimedeltaIndex` (:issue:`51896`)
- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` losing time zone when resampling empty data (:issue:`53664`)
- Bug in :meth:`DataFrameGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmax` return wrong dtype when used on empty DataFrameGroupBy or SeriesGroupBy (:issue:`51423`)
Expand Down
9 changes: 8 additions & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -9223,11 +9223,18 @@ def resample(
Freq: 17T, dtype: int64

>>> ts.resample('17min', origin='2000-01-01').sum()
2000-01-01 00:00:00 0
2000-01-01 00:17:00 0
2000-01-01 00:34:00 0
2000-01-01 00:51:00 0
2000-01-01 01:08:00 0
..
2000-10-01 23:07:00 0
2000-10-01 23:24:00 3
2000-10-01 23:41:00 15
2000-10-01 23:58:00 45
2000-10-02 00:15:00 45
Freq: 17T, dtype: int64
Freq: 17T, Length: 23296, dtype: int64
mcgeestocks marked this conversation as resolved.
Show resolved Hide resolved

If you want to adjust the start of the bins with an `offset` Timedelta, the two
following lines are equivalent:
Expand Down
9 changes: 8 additions & 1 deletion pandas/core/groupby/grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,11 +207,18 @@ class Grouper:
Freq: 17T, dtype: int64

>>> ts.groupby(pd.Grouper(freq='17min', origin='2000-01-01')).sum()
2000-01-01 00:00:00 0
2000-01-01 00:17:00 0
2000-01-01 00:34:00 0
2000-01-01 00:51:00 0
2000-01-01 01:08:00 0
..
2000-10-01 23:07:00 0
2000-10-01 23:24:00 3
2000-10-01 23:41:00 15
2000-10-01 23:58:00 45
2000-10-02 00:15:00 45
Freq: 17T, dtype: int64
Freq: 17T, Length: 23296, dtype: int64
mcgeestocks marked this conversation as resolved.
Show resolved Hide resolved

If you want to adjust the start of the bins with an `offset` Timedelta, the two
following lines are equivalent:
Expand Down
14 changes: 12 additions & 2 deletions pandas/core/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -2457,8 +2457,15 @@ def _get_timestamp_range_edges(
"""
if isinstance(freq, Tick):
index_tz = first.tz
if isinstance(origin, Timestamp) and (origin.tz is None) != (index_tz is None):
raise ValueError("The origin must have the same timezone as the index.")
if isinstance(origin, Timestamp) and origin.tz != index_tz:
if first.utcoffset() != origin.utcoffset():
raise ValueError("The origin must have the same timezone as the index.")
MarcoGorelli marked this conversation as resolved.
Show resolved Hide resolved
elif isinstance(origin, Timestamp):
if origin <= first:
first = origin
elif origin >= last:
last = origin

if origin == "epoch":
# set the epoch based on the timezone to have similar bins results when
# resampling on the same kind of indexes on different timezones
Expand All @@ -2480,6 +2487,9 @@ def _get_timestamp_range_edges(
first = first.tz_localize(index_tz)
last = last.tz_localize(index_tz)
else:
if isinstance(origin, Timestamp):
first = origin

first = first.normalize()
last = last.normalize()

Expand Down
61 changes: 48 additions & 13 deletions pandas/tests/resample/test_datetime_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -790,24 +790,34 @@ def test_resample_offset(unit):


@pytest.mark.parametrize(
"kwargs",
"kwargs, expected",
[
{"origin": "1999-12-31 23:57:00"},
{"origin": Timestamp("1970-01-01 00:02:00")},
{"origin": "epoch", "offset": "2m"},
(
{"origin": "1999-12-31 23:57:00"},
["1999-12-31 23:57:00", "2000-01-01 01:57:00"],
),
(
{"origin": Timestamp("1970-01-01 00:02:00")},
["1970-01-01 00:02:00", "2000-01-01 01:57:00"],
),
(
{"origin": "epoch", "offset": "2m"},
["1999-12-31 23:57:00", "2000-01-01 01:57:00"],
),
# origin of '1999-12-31 12:02:00' should be equivalent for this case
{"origin": "1999-12-31 12:02:00"},
{"offset": "-3m"},
(
{"origin": "1999-12-31 12:02:00"},
["1999-12-31 12:02:00", "2000-01-01 01:57:00"],
),
({"offset": "-3m"}, ["1999-12-31 23:57:00", "2000-01-01 01:57:00"]),
],
)
def test_resample_origin(kwargs, unit):
def test_resample_origin(kwargs, unit, expected):
# GH 31809
rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s").as_unit(unit)
ts = Series(np.random.randn(len(rng)), index=rng)

exp_rng = date_range(
"1999-12-31 23:57:00", "2000-01-01 01:57", freq="5min"
).as_unit(unit)
exp_rng = date_range(expected[0], expected[1], freq="5min").as_unit(unit)

resampled = ts.resample("5min", **kwargs).mean()
tm.assert_index_equal(resampled.index, exp_rng)
Expand Down Expand Up @@ -837,6 +847,31 @@ def test_resample_bad_offset(offset, unit):
ts.resample("5min", offset=offset)


def test_resample_monthstart_origin():
# GH 53662
df = DataFrame({"ts": [datetime(1999, 12, 31, 0, 0, 0)], "values": [10.0]})
result = df.resample("2MS", on="ts", origin="1999-11-01")["values"].sum()
excepted = Series(
[10.0],
index=DatetimeIndex(
["1999-11-01"], dtype="datetime64[ns]", name="ts", freq="2MS"
),
)
tm.assert_index_equal(result.index, excepted.index)

df = DataFrame({"ts": [datetime(1999, 12, 31, 20)], "values": [10.0]})
result = df.resample(
"3YS", on="ts", closed="left", label="left", origin=datetime(1995, 1, 1)
)["values"].sum()
expected = Series(
[0, 10.0],
index=DatetimeIndex(
["1995-01-01", "1998-01-01"], dtype="datetime64[ns]", name="ts", freq="3YS"
),
)
tm.assert_index_equal(result.index, expected.index)


def test_resample_origin_prime_freq(unit):
# GH 31809
start, end = "2000-10-01 23:30:00", "2000-10-02 00:30:00"
Expand Down Expand Up @@ -868,7 +903,7 @@ def test_resample_origin_prime_freq(unit):
tm.assert_index_equal(resampled.index, exp_rng)

exp_rng = date_range(
"2000-10-01 23:24:00", "2000-10-02 00:15:00", freq="17min"
"2000-01-01 00:00:00", "2000-10-02 00:15:00", freq="17min"
).as_unit(unit)
resampled = ts.resample("17min", origin="2000-01-01").mean()
tm.assert_index_equal(resampled.index, exp_rng)
Expand All @@ -887,11 +922,11 @@ def test_resample_origin_with_tz(unit):
exp_rng = date_range(
"1999-12-31 23:57:00", "2000-01-01 01:57", freq="5min", tz=tz
).as_unit(unit)
resampled = ts.resample("5min", origin="1999-12-31 23:57:00+00:00").mean()
resampled = ts.resample("5min", origin="1999-12-31 23:57:00+01:00").mean()
tm.assert_index_equal(resampled.index, exp_rng)

# origin of '1999-12-31 12:02:00+01:00' should be equivalent for this case
resampled = ts.resample("5min", origin="1999-12-31 12:02:00+03:00").mean()
resampled = ts.resample("5min", origin="1999-12-31 12:02:00+01:00").mean()
tm.assert_index_equal(resampled.index, exp_rng)

resampled = ts.resample("5min", origin="epoch", offset="2m").mean()
Expand Down
32 changes: 19 additions & 13 deletions pandas/tests/resample/test_resampler_grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,19 @@ def test_groupby_with_origin():
start, end = "1/1/2000 00:00:00", "1/31/2000 00:00"
middle = "1/15/2000 00:00:00"

# test origin on 1970-01-01 00:00:00
rng = date_range("1970-01-01 00:00:00", end, freq="1231min") # prime number
ts = Series(np.random.randn(len(rng)), index=rng)
middle_ts = rng[len(rng) // 2]
ts2 = ts[middle_ts:end]

origin = Timestamp(0)
adjusted_grouper = pd.Grouper(freq=freq, origin=origin)
adjusted_count_ts = ts.groupby(adjusted_grouper).agg("count")
adjusted_count_ts = adjusted_count_ts[middle_ts:end]
adjusted_count_ts2 = ts2.groupby(adjusted_grouper).agg("count")
tm.assert_series_equal(adjusted_count_ts, adjusted_count_ts2[middle_ts:end])

rng = date_range(start, end, freq="1231min") # prime number
ts = Series(np.random.randn(len(rng)), index=rng)
ts2 = ts[middle:end]
Expand All @@ -154,26 +167,19 @@ def test_groupby_with_origin():
with pytest.raises(AssertionError, match="Index are different"):
tm.assert_index_equal(count_ts.index, count_ts2.index)

# test origin on 1970-01-01 00:00:00
origin = Timestamp(0)
adjusted_grouper = pd.Grouper(freq=freq, origin=origin)
adjusted_count_ts = ts.groupby(adjusted_grouper).agg("count")
adjusted_count_ts = adjusted_count_ts[middle:end]
adjusted_count_ts2 = ts2.groupby(adjusted_grouper).agg("count")
tm.assert_series_equal(adjusted_count_ts, adjusted_count_ts2)

# test origin on 2049-10-18 20:00:00

rng = date_range(start, "2049-10-18 20:00:00", freq="1231min") # prime number
ts = Series(np.random.randn(len(rng)), index=rng)
middle_ts = rng[len(rng) // 2]
ts2 = ts[middle_ts:end]
origin_future = Timestamp(0) + pd.Timedelta("1399min") * 30_000
adjusted_grouper2 = pd.Grouper(freq=freq, origin=origin_future)
adjusted2_count_ts = ts.groupby(adjusted_grouper2).agg("count")
adjusted2_count_ts = adjusted2_count_ts[middle:end]
adjusted2_count_ts = adjusted2_count_ts[middle_ts:end]
adjusted2_count_ts2 = ts2.groupby(adjusted_grouper2).agg("count")
tm.assert_series_equal(adjusted2_count_ts, adjusted2_count_ts2)

# both grouper use an adjusted timestamp that is a multiple of 1399 min
# they should be equals even if the adjusted_timestamp is in the future
tm.assert_series_equal(adjusted_count_ts, adjusted2_count_ts2)


def test_nearest():
# GH 17496
Expand Down