Skip to content

Fix interpolate limit area and limit direction with pad #35893

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
9afe992
Added failing test for https://github.com/pandas-dev/pandas/issues/26796
cchwala Jan 15, 2020
3a191b9
Added implementation to support `limit_area`
cchwala Jan 15, 2020
fd5d8e8
fix test
cchwala Jan 15, 2020
26d88ed
pep8
cchwala Jan 15, 2020
6597aca
fixed small error that actually had no effect since the input array `…
cchwala Jan 15, 2020
c536d3c
Raise when forbidden combination of `method` and `limit_direction` ar…
cchwala Jan 15, 2020
ed9cf21
Updated docstring with info about allowed combinations of `method` an…
cchwala Jan 15, 2020
2980325
clean up
cchwala Jan 15, 2020
ecf428e
Added entry to whatsnew file
cchwala Jan 15, 2020
f8a3423
Removed `axis` kwarg from `interpolate_1d_fill` because it was unused
cchwala Jan 15, 2020
6733186
Type annotations added to new function `interpolate_1d_fill`
cchwala Jan 15, 2020
c5b77d2
fixed incorrectly sorted imports
cchwala Jan 15, 2020
0bb36de
Added type annotation, updated docstring and removed unnecessary argu…
cchwala Feb 5, 2020
a467afd
Reverting docstring entry for default value of `limit_direction`
cchwala Feb 18, 2020
5466d8c
Moved logic for calling `missing.interpolate_1d_fill` to `missing.int…
cchwala Feb 18, 2020
3e968fc
Moved whatsnew entry to v1.1.0.rst
cchwala Feb 18, 2020
556a3cf
clean up
cchwala Feb 18, 2020
6c1e429
fixed missing Optional in type definition
cchwala Feb 18, 2020
767b0ca
small fix so that CI type validation does not complain
cchwala Mar 16, 2020
b82aaff
Merge remote-tracking branch 'upstream/master' into fix_interpolate_l…
cchwala Mar 17, 2020
26ef7b5
Apply suggestions from code review concerning list instead of set
cchwala Mar 19, 2020
b4b6b5a
added import for missing List type
cchwala Mar 19, 2020
e259549
fixed unsorted order of imports
cchwala Mar 19, 2020
8f89508
Merge remote-tracking branch 'upstream/master' into fix_interpolate_l…
simonjayhawkins Jun 12, 2020
e3f0ab5
Merge remote-tracking branch 'upstream/master' into fix_interpolate_l…
simonjayhawkins Jun 13, 2020
669772f
Merge remote-tracking branch 'upstream/master' into fix_interpolate_l…
simonjayhawkins Jul 6, 2020
594f818
Merge remote-tracking branch 'upstream/master' into fix_interpolate_l…
simonjayhawkins Aug 25, 2020
2eca786
move whatsnew
simonjayhawkins Aug 25, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@ Missing
^^^^^^^

- Bug in :meth:`SeriesGroupBy.transform` now correctly handles missing values for `dropna=False` (:issue:`35014`)
- Bug in :meth:`Series.interpolate` where kwarg ``limit_area`` and ``limit_direction`` had no effect when using methods ``pad`` and `backfill`` (:issue:`31048`)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
- Bug in :meth:`Series.interpolate` where kwarg ``limit_area`` and ``limit_direction`` had no effect when using methods ``pad`` and `backfill`` (:issue:`31048`)
- Bug in :meth:`Series.interpolate` where kwarg ``limit_area`` and ``limit_direction`` had no effect when using methods ``pad`` and ``backfill`` (:issue:`31048`)

-

MultiIndex
Expand Down
7 changes: 5 additions & 2 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1127,6 +1127,7 @@ def interpolate(
axis=axis,
inplace=inplace,
limit=limit,
limit_area=limit_area,
fill_value=fill_value,
coerce=coerce,
downcast=downcast,
Expand Down Expand Up @@ -1155,6 +1156,7 @@ def _interpolate_with_fill(
axis: int = 0,
inplace: bool = False,
limit: Optional[int] = None,
limit_area=None,
fill_value: Optional[Any] = None,
coerce: bool = False,
downcast: Optional[str] = None,
Expand All @@ -1176,16 +1178,17 @@ def _interpolate_with_fill(
# We only get here for non-ExtensionBlock
fill_value = convert_scalar_for_putitemlike(fill_value, self.values.dtype)

values = missing.interpolate_2d(
interp_values = missing.interpolate_2d(
values,
method=method,
axis=axis,
limit=limit,
fill_value=fill_value,
limit_area=limit_area,
dtype=self.dtype,
)

blocks = [self.make_block_same_class(values, ndim=self.ndim)]
blocks = [self.make_block_same_class(interp_values, ndim=self.ndim)]
return self._maybe_downcast(blocks, downcast)

def _interpolate(
Expand Down
233 changes: 177 additions & 56 deletions pandas/core/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@
Routines for filling missing data.
"""

from typing import Any, List, Optional, Set, Union
from typing import Any, List, Optional

import numpy as np

from pandas._libs import algos, lib
from pandas._typing import ArrayLike, Dtype, Hashable
from pandas.compat._optional import import_optional_dependency

from pandas.core.dtypes.cast import infer_dtype_from_array
Expand Down Expand Up @@ -230,41 +231,12 @@ def interpolate_1d(
# default limit is unlimited GH #16282
limit = algos._validate_limit(nobs=None, limit=limit)

# These are sets of index pointers to invalid values... i.e. {0, 1, etc...
all_nans = set(np.flatnonzero(invalid))
start_nans = set(range(find_valid_index(yvalues, "first")))
end_nans = set(range(1 + find_valid_index(yvalues, "last"), len(valid)))
mid_nans = all_nans - start_nans - end_nans

# Like the sets above, preserve_nans contains indices of invalid values,
# but in this case, it is the final set of indices that need to be
# preserved as NaN after the interpolation.

# For example if limit_direction='forward' then preserve_nans will
# contain indices of NaNs at the beginning of the series, and NaNs that
# are more than 'limit' away from the prior non-NaN.

# set preserve_nans based on direction using _interp_limit
preserve_nans: Union[List, Set]
if limit_direction == "forward":
preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0))
elif limit_direction == "backward":
preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit))
else:
# both directions... just use _interp_limit
preserve_nans = set(_interp_limit(invalid, limit, limit))

# if limit_area is set, add either mid or outside indices
# to preserve_nans GH #16284
if limit_area == "inside":
# preserve NaNs on the outside
preserve_nans |= start_nans | end_nans
elif limit_area == "outside":
# preserve NaNs on the inside
preserve_nans |= mid_nans

# sort preserve_nans and convert to list
preserve_nans = sorted(preserve_nans)
preserve_nans = _derive_indices_of_nans_to_preserve(
yvalues=yvalues,
limit=limit,
limit_area=limit_area,
limit_direction=limit_direction,
)

yvalues = getattr(yvalues, "values", yvalues)
result = yvalues.copy()
Expand Down Expand Up @@ -307,6 +279,73 @@ def interpolate_1d(
return result


def _derive_indices_of_nans_to_preserve(
    yvalues: ArrayLike,
    limit: Optional[int] = None,
    limit_area: Optional[str] = None,
    limit_direction: Optional[str] = None,
) -> List[int]:
    """
    Work out which positions must remain NaN once interpolation is done.

    The arguments mirror the identically named arguments of
    `interpolate_1d`. The caller performs the interpolation first and then
    uses the returned positions to put NaNs back where they are required.

    Parameters
    ----------
    yvalues : ArrayLike
        1-d array of values of the initial Series or DataFrame.
    limit : int, optional
        Forwarded to `_interp_limit` as the forward and/or backward limit,
        depending on `limit_direction`.
    limit_area : str, optional
        "inside" additionally preserves the NaNs before the first and after
        the last valid value; "outside" additionally preserves the NaNs in
        between. Any other value adds nothing.
    limit_direction : str, optional
        "forward" or "backward"; any other value is treated as both
        directions.

    Returns
    -------
    list of int
        Sorted positions in `yvalues` at which NaN has to be preserved.
    """
    is_missing = isna(yvalues)

    # Positions ahead of the first valid value and behind the last one.
    first_valid = find_valid_index(yvalues, "first")
    leading = set(range(first_valid))
    last_valid = find_valid_index(yvalues, "last")
    trailing = set(range(last_valid + 1, len(is_missing)))

    # Direction: e.g. for a forward fill the leading NaNs can never be
    # filled, and neither can NaNs more than `limit` away from the prior
    # non-NaN (as reported by `_interp_limit`).
    if limit_direction == "forward":
        keep = leading | set(_interp_limit(is_missing, limit, 0))
    elif limit_direction == "backward":
        keep = trailing | set(_interp_limit(is_missing, 0, limit))
    else:
        # both directions: `_interp_limit` alone decides
        keep = set(_interp_limit(is_missing, limit, limit))

    # Area restriction, GH #16284.
    if limit_area == "inside":
        # only the inside may be filled -> keep the outside NaNs
        keep |= leading | trailing
    elif limit_area == "outside":
        # only the outside may be filled -> keep the NaNs in between
        interior = set(np.flatnonzero(is_missing)) - leading - trailing
        keep |= interior

    return sorted(keep)


def _interpolate_scipy_wrapper(
x, y, new_x, method, fill_value=None, bounds_error=False, order=None, **kwargs
):
Expand Down Expand Up @@ -542,45 +581,127 @@ def _cubicspline_interpolate(xi, yi, x, axis=0, bc_type="not-a-knot", extrapolat
return P(x)


def interpolate_2d(
values, method="pad", axis=0, limit=None, fill_value=None, dtype=None
def interpolate_1d_fill(
values,
method: str = "pad",
limit: Optional[int] = None,
limit_area: Optional[str] = None,
fill_value: Optional[Hashable] = None,
dtype: Optional[Dtype] = None,
):
"""
Perform an actual interpolation of values, values will be make 2-d if
needed fills inplace, returns the result.
This is a 1D-version of `interpolate_2d`, which is used for methods `pad`
and `backfill` when interpolating. This 1D-version is necessary to be
able to handle kwarg `limit_area` via the function
`_derive_indices_of_nans_to_preserve`. It is used the same way as the
1D-interpolation functions which are based on scipy-interpolation, i.e.
via np.apply_along_axis.
"""
if method == "pad":
limit_direction = "forward"
elif method == "backfill":
limit_direction = "backward"
else:
raise ValueError("`method` must be either 'pad' or 'backfill'.")

orig_values = values

transf = (lambda x: x) if axis == 0 else (lambda x: x.T)
yvalues = values

# reshape a 1 dim if needed
ndim = values.ndim
if values.ndim == 1:
if axis != 0: # pragma: no cover
raise AssertionError("cannot interpolate on a ndim == 1 with axis != 0")
values = values.reshape(tuple((1,) + values.shape))
if values.ndim > 1:
raise AssertionError("This only works with 1D data.")

if fill_value is None:
mask = None
else: # todo create faster fill func without masking
mask = mask_missing(transf(values), fill_value)
mask = mask_missing(values, fill_value)

preserve_nans = _derive_indices_of_nans_to_preserve(
yvalues=yvalues,
limit=limit,
limit_area=limit_area,
limit_direction=limit_direction,
)

method = clean_fill_method(method)
if method == "pad":
values = transf(pad_2d(transf(values), limit=limit, mask=mask, dtype=dtype))
values = pad_1d(values, limit=limit, mask=mask, dtype=dtype)
else:
values = transf(
backfill_2d(transf(values), limit=limit, mask=mask, dtype=dtype)
)

# reshape back
if ndim == 1:
values = values[0]
values = backfill_1d(values, limit=limit, mask=mask, dtype=dtype)

if orig_values.dtype.kind == "M":
# convert float back to datetime64
values = values.astype(orig_values.dtype)

values[preserve_nans] = fill_value
return values


def interpolate_2d(
    values,
    method="pad",
    axis=0,
    limit=None,
    fill_value=None,
    limit_area=None,
    dtype=None,
):
    """
    Pad or backfill the missing entries of `values` along `axis`.

    When `limit_area` is None, a 1-d input is temporarily made 2-d so that
    `pad_2d` / `backfill_2d` can be used. The fill may operate in place on
    `values`; the filled array is returned.

    Parameters
    ----------
    values : np.ndarray
        Array to fill.
    method : str, default "pad"
        Fill method. It is normalised with `clean_fill_method` before use,
        on both code paths.
    axis : int, default 0
        Axis along which to fill.
    limit : int, optional
        Forwarded to the fill routine.
    fill_value : scalar, optional
        When given, a mask built with `mask_missing` is handed to the fill
        routine; otherwise no mask is passed.
    limit_area : str, optional
        When not None, every 1-d slice along `axis` is filled with
        `interpolate_1d_fill`, because `pad_2d` and `backfill_2d` do not
        support this option.
    dtype : optional
        Forwarded to the fill routine.

    Returns
    -------
    np.ndarray
        The filled values. If the input had a datetime64 dtype, the result
        is cast back to that dtype.

    Raises
    ------
    AssertionError
        If `limit_area` is None, `values` is 1-d and `axis` is not 0.
    """
    orig_values = values

    # Normalise the method name before branching. `interpolate_1d_fill`
    # compares the name against the literals "pad" / "backfill", so the
    # `limit_area` path must receive the cleaned name just like the 2-d
    # path does (presumably this maps aliases such as "ffill"/"bfill" --
    # confirm against `clean_fill_method`).
    method = clean_fill_method(method)

    # We have to distinguish two cases:
    # 1. When kwarg `limit_area` is used: It is not
    #    supported by `pad_2d` and `backfill_2d`. Using this kwarg only
    #    works by applying the fill along a certain axis.
    # 2. All other cases.
    if limit_area is not None:

        def func(x):
            return interpolate_1d_fill(
                x,
                method=method,
                limit=limit,
                limit_area=limit_area,
                fill_value=fill_value,
                dtype=dtype,
            )

        # Beware that this also changes the input array `values`!
        values = np.apply_along_axis(func, axis, values)
    else:
        transf = (lambda x: x) if axis == 0 else (lambda x: x.T)

        # reshape a 1 dim if needed
        ndim = values.ndim
        if values.ndim == 1:
            if axis != 0:  # pragma: no cover
                raise AssertionError("cannot interpolate on a ndim == 1 with axis != 0")
            values = values.reshape(tuple((1,) + values.shape))

        if fill_value is None:
            mask = None
        else:  # todo create faster fill func without masking
            mask = mask_missing(transf(values), fill_value)

        if method == "pad":
            values = transf(pad_2d(transf(values), limit=limit, mask=mask, dtype=dtype))
        else:
            values = transf(
                backfill_2d(transf(values), limit=limit, mask=mask, dtype=dtype)
            )

        # reshape back
        if ndim == 1:
            values = values[0]

    if orig_values.dtype.kind == "M":
        # convert float back to datetime64
        values = values.astype(orig_values.dtype)

    return values


Expand Down
48 changes: 48 additions & 0 deletions pandas/tests/series/methods/test_interpolate.py
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,54 @@ def test_interp_limit_area(self):
with pytest.raises(ValueError, match=msg):
s.interpolate(method="linear", limit_area="abc")

def test_interp_limit_area_with_pad(self):
    # Test for issue #26796: `limit_area` (alone or combined with `limit`)
    # has to be respected when filling forward with method="pad".
    nan = np.nan
    ser = Series([nan, nan, 3, nan, nan, nan, 7, nan, nan])

    # (extra keyword arguments, expected values after the fill)
    cases = [
        (
            {"limit_area": "inside"},
            [nan, nan, 3.0, 3.0, 3.0, 3.0, 7.0, nan, nan],
        ),
        (
            {"limit_area": "inside", "limit": 1},
            [nan, nan, 3.0, 3.0, nan, nan, 7.0, nan, nan],
        ),
        (
            {"limit_area": "outside"},
            [nan, nan, 3.0, nan, nan, nan, 7.0, 7.0, 7.0],
        ),
        (
            {"limit_area": "outside", "limit": 1},
            [nan, nan, 3.0, nan, nan, nan, 7.0, 7.0, nan],
        ),
    ]

    for kwargs, filled in cases:
        expected = Series(filled)
        result = ser.interpolate(method="pad", **kwargs)
        tm.assert_series_equal(result, expected)

def test_interp_limit_area_with_backfill(self):
    # Test for issue #26796: `limit_area` (alone or combined with `limit`)
    # has to be respected when filling backward with method="bfill".
    nan = np.nan
    ser = Series([nan, nan, 3, nan, nan, nan, 7, nan, nan])

    # (extra keyword arguments, expected values after the fill)
    cases = [
        (
            {"limit_area": "inside"},
            [nan, nan, 3.0, 7.0, 7.0, 7.0, 7.0, nan, nan],
        ),
        (
            {"limit_area": "inside", "limit": 1},
            [nan, nan, 3.0, nan, nan, 7.0, 7.0, nan, nan],
        ),
        (
            {"limit_area": "outside"},
            [3.0, 3.0, 3.0, nan, nan, nan, 7.0, nan, nan],
        ),
        (
            {"limit_area": "outside", "limit": 1},
            [nan, 3.0, 3.0, nan, nan, nan, 7.0, nan, nan],
        ),
    ]

    for kwargs, filled in cases:
        expected = Series(filled)
        result = ser.interpolate(method="bfill", **kwargs)
        tm.assert_series_equal(result, expected)

@pytest.mark.parametrize(
"method, limit_direction, expected",
[
Expand Down