Skip to content

Fix/time series interpolation is wrong 21351 #56515

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
ff6d12f
fix: Fixes wrong doctest output in `pandas.core.resample.Resampler.in…
cbpygit Jan 2, 2024
1593af0
Resolved merge conflicts
cbpygit Jan 2, 2024
db68c2d
fix: Fixes wrong test case assumption for interpolation
cbpygit Dec 15, 2023
dd8b8d3
fix: Make sure frequency indexes are preserved with new interpolation…
cbpygit Dec 15, 2023
a04a3a2
fix: Fixes new-style up-sampling interpolation for MultiIndexes resul…
cbpygit Dec 21, 2023
efbba10
fix: Fixes wrong test case assumption when using linear interpolation…
cbpygit Jan 2, 2024
0294464
fix: Fixes wrong test case assumption when using linear interpolation…
cbpygit Jan 2, 2024
537f8bf
fix: Adds test skips for interpolation methods that require scipy if …
cbpygit Jan 2, 2024
901701c
Merge branch 'main' into fix/time-series-interpolation-is-wrong-21351
cbpygit Mar 6, 2024
4f78c75
fix: Makes sure keyword arguments "downcast" is not passed to scipy i…
cbpygit Mar 6, 2024
a5bcd45
fix: Adjusted expected warning type in `test_groupby_resample_interpo…
cbpygit Mar 6, 2024
7d4b4ce
fix: Fixes failing interpolation on groupby if the index has `name`=N…
cbpygit Mar 6, 2024
dbae717
Merge branch 'main' into fix/time-series-interpolation-is-wrong-21351
cbpygit Mar 8, 2024
05be840
Merge branch 'main' into fix/time-series-interpolation-is-wrong-21351
cbpygit Mar 12, 2024
0912249
Trigger Actions
cbpygit Mar 12, 2024
49a7c4c
Merge remote-tracking branch 'origin/fix/time-series-interpolation-is…
cbpygit Mar 12, 2024
a5a7299
Merge branch 'main' into fix/time-series-interpolation-is-wrong-21351
cbpygit Mar 14, 2024
6a6fa88
Merge branch 'main' into fix/time-series-interpolation-is-wrong-21351
cbpygit Mar 15, 2024
4ebed74
Merge branch 'main' into fix/time-series-interpolation-is-wrong-21351
MarcoGorelli Mar 20, 2024
0ee5b8d
feat: Raise error on attempt to interpolate a MultiIndex data frame, …
cbpygit Apr 2, 2024
b2bc373
Merge branch 'main' into fix/time-series-interpolation-is-wrong-21351
cbpygit Apr 2, 2024
6109102
Merge branch 'main' into fix/time-series-interpolation-is-wrong-21351
cbpygit Apr 3, 2024
d6af64a
Apply suggestions from code review
cbpygit Apr 4, 2024
2a86a27
refactor: Adjusted error type assertion in test case
cbpygit Apr 4, 2024
9c90e23
refactor: Removed unused parametrization definitions and switched to …
cbpygit Apr 4, 2024
4b2f3dc
fix: Adds forgotten "@" before pytest.mark.parametrize
cbpygit Apr 4, 2024
d11c162
Merge branch 'main' into fix/time-series-interpolation-is-wrong-21351
cbpygit Apr 4, 2024
789c511
refactor: Apply suggestions from code review
cbpygit Apr 4, 2024
4f6d102
refactor: Switched to ficture params syntax for test case parametriza…
cbpygit Apr 4, 2024
c0547b5
Merge branch 'main' into fix/time-series-interpolation-is-wrong-21351
cbpygit Apr 7, 2024
d6382f8
Merge branch 'main' into fix/time-series-interpolation-is-wrong-21351
cbpygit Apr 9, 2024
4e9a616
Update pandas/tests/resample/test_time_grouper.py
cbpygit Apr 13, 2024
c655bf1
Update pandas/tests/resample/test_base.py
cbpygit Apr 13, 2024
e916da9
Merge branch 'main' into fix/time-series-interpolation-is-wrong-21351
cbpygit Apr 14, 2024
eaa7e07
refactor: Fixes too long line
cbpygit Apr 14, 2024
649bfa2
tests: Fixes test that fails due to unimportant index name comparison
cbpygit Apr 14, 2024
4cfbbf1
docs: Added entry in whatsnew
cbpygit Apr 24, 2024
76794e3
Empty-Commit
cbpygit Apr 24, 2024
6ad9b26
Merge branch 'main' into fix/time-series-interpolation-is-wrong-21351
cbpygit Apr 24, 2024
6555141
Empty-Commit
cbpygit Apr 24, 2024
48850cc
Empty-Commit
cbpygit Apr 24, 2024
8eea71c
Merge branch 'main' into fix/time-series-interpolation-is-wrong-21351
cbpygit Apr 24, 2024
7f957cf
docs: Sorted whatsnew
cbpygit Apr 24, 2024
51e95e0
Merge remote-tracking branch 'origin/fix/time-series-interpolation-is…
cbpygit Apr 24, 2024
12bdd90
docs: Adjusted bug fix note and moved it to the right section
cbpygit Apr 24, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Resolved merge conflicts
  • Loading branch information
cbpygit committed Jan 2, 2024
commit 1593af0fe8530ea994b6789e5b819ff0d6b742b3
11 changes: 10 additions & 1 deletion pandas/core/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,16 @@ def get_interp_index(method, index: Index) -> Index:
# prior default
from pandas import Index

index = Index(np.arange(len(index)))
if isinstance(index.dtype, DatetimeTZDtype) or lib.is_np_dtype(
index.dtype, "mM"
):
# Convert datetime-like indexes to int64
index = Index(index.view("i8"))

elif not is_numeric_dtype(index.dtype):
# We keep behavior consistent with prior versions of pandas for
# non-numeric, non-datetime indexes
index = Index(np.arange(len(index)))
else:
methods = {"index", "values", "nearest", "time"}
is_numeric_or_datetime = (
Expand Down
25 changes: 24 additions & 1 deletion pandas/core/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@
TimedeltaIndex,
timedelta_range,
)
from pandas.core.reshape.concat import concat

from pandas.tseries.frequencies import (
is_subperiod,
Expand Down Expand Up @@ -1086,7 +1087,23 @@ def interpolate(
"""
assert downcast is lib.no_default # just checking coverage
result = self._upsample("asfreq")
return result.interpolate(

# If the original data has timestamps which are not aligned with the
# target timestamps, we need to add those points back to the data frame
# that is supposed to be interpolated. This does not work with
# PeriodIndex, so we skip this case.
obj = self._selected_obj
is_period_index = isinstance(obj.index, PeriodIndex)

if not is_period_index:
final_index = result.index
missing_data_points_index = obj.index.difference(final_index)
if len(missing_data_points_index) > 0:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in the opposite case where the difference is empty, i think a the sort_index and .loc below (which makes a copy) can be avoided

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jbrockmendel Can you please clarify? "in the opposite case where the difference is empty" --> then missing_data_points_index will be empty, so its length will be zero. In that case, the if-expression evaluates to False, so that the loc/sort is indeed not executed. Maybe I am overlooking something.

result = concat(
[result, obj.loc[missing_data_points_index]]
).sort_index()

result_interpolated = result.interpolate(
method=method,
axis=axis,
limit=limit,
Expand All @@ -1097,6 +1114,12 @@ def interpolate(
**kwargs,
)

# We make sure that original data points which do not align with the
# resampled index are removed
if is_period_index:
return result_interpolated
return result_interpolated.loc[final_index]

@final
def asfreq(self, fill_value=None):
"""
Expand Down
92 changes: 92 additions & 0 deletions pandas/tests/resample/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,51 @@
from pandas.core.indexes.timedeltas import timedelta_range
from pandas.core.resample import _asfreq_compat

# a fixture value can be overridden by the test parameter value. Note that the
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you just directly parameterize tests instead of using this pattern? This pattern was removed in the last release or so

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done and removed the unused parametrizations

# value of the fixture can be overridden this way even if the test doesn't use
# it directly (doesn't mention it in the function prototype).
# see https://docs.pytest.org/en/latest/fixture.html#override-a-fixture-with-direct-test-parametrization # noqa: E501
# in this module we override the fixture values defined in conftest.py
# tuples of '_index_factory,_series_name,_index_start,_index_end'
DATE_RANGE = (date_range, "dti", datetime(2005, 1, 1), datetime(2005, 1, 10))
PERIOD_RANGE = (period_range, "pi", datetime(2005, 1, 1), datetime(2005, 1, 10))
TIMEDELTA_RANGE = (timedelta_range, "tdi", "1 day", "10 day")

all_ts = pytest.mark.parametrize(
"_index_factory,_series_name,_index_start,_index_end",
[DATE_RANGE, PERIOD_RANGE, TIMEDELTA_RANGE],
)

all_1d_no_arg_interpolation_methods = pytest.mark.parametrize(
"method",
[
"linear",
"time",
"index",
"values",
"nearest",
"zero",
"slinear",
"quadratic",
"cubic",
"barycentric",
"krogh",
"from_derivatives",
"piecewise_polynomial",
"pchip",
"akima",
],
)


@pytest.fixture
def create_index(_index_factory):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't think this will be used anymore

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed

def _create_index(*args, **kwargs):
"""return the _index_factory created using the args, kwargs"""
return _index_factory(*args, **kwargs)

return _create_index


@pytest.mark.parametrize("freq", ["2D", "1h"])
@pytest.mark.parametrize(
Expand Down Expand Up @@ -89,6 +134,53 @@ def test_resample_interpolate(index):
tm.assert_frame_equal(result, expected)


@all_1d_no_arg_interpolation_methods
def test_resample_interpolate_regular_sampling_off_grid(method):
# GH#21351
index = date_range("2000-01-01 00:01:00", periods=5, freq="2h")
ser = Series(np.arange(5.0), index)

# Resample to 1 hour sampling and interpolate with the given method
ser_resampled = ser.resample("1h").interpolate(method)

# Check that none of the resampled values are NaN, except the first one
# which lies 1 minute before the first actual data point
assert np.isnan(ser_resampled.iloc[0])
assert not ser_resampled.iloc[1:].isna().any()

if method not in ["nearest", "zero"]:
# Check that the resampled values are close to the expected values
# except for methods with known inaccuracies
assert np.all(
np.isclose(ser_resampled.values[1:], np.arange(0.5, 4.5, 0.5), rtol=1.0e-1)
)


@all_1d_no_arg_interpolation_methods
def test_resample_interpolate_irregular_sampling(method):
# GH#21351
ser = Series(
np.linspace(0.0, 1.0, 5),
index=DatetimeIndex(
[
"2000-01-01 00:00:03",
"2000-01-01 00:00:22",
"2000-01-01 00:00:24",
"2000-01-01 00:00:31",
"2000-01-01 00:00:39",
]
),
)

# Resample to 5 second sampling and interpolate with the given method
ser_resampled = ser.resample("5s").interpolate(method)

# Check that none of the resampled values are NaN, except the first one
# which lies 3 seconds before the first actual data point
assert np.isnan(ser_resampled.iloc[0])
assert not ser_resampled.iloc[1:].isna().any()


def test_raises_on_non_datetimelike_index():
# this is a non datetimelike index
xp = DataFrame()
Expand Down