Skip to content

Commit 6a28403

Browse files
feat: add 'index', 'pad', 'nearest' interpolate methods (#162)
* feat: add 'index', 'pad', 'nearest' interpolate methods
1 parent 765446a commit 6a28403

File tree

6 files changed

+243
-90
lines changed

6 files changed

+243
-90
lines changed

bigframes/core/block_transforms.py

Lines changed: 160 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import bigframes.core.blocks as blocks
2323
import bigframes.core.ordering as ordering
2424
import bigframes.core.window_spec as windows
25+
import bigframes.dtypes as dtypes
2526
import bigframes.operations as ops
2627
import bigframes.operations.aggregations as agg_ops
2728

@@ -106,67 +107,59 @@ def indicate_duplicates(
106107

107108

108109
def interpolate(block: blocks.Block, method: str = "linear") -> blocks.Block:
109-
if method != "linear":
110+
supported_methods = [
111+
"linear",
112+
"values",
113+
"index",
114+
"nearest",
115+
"zero",
116+
"slinear",
117+
]
118+
if method not in supported_methods:
110119
raise NotImplementedError(
111-
f"Only 'linear' interpolate method supported. {constants.FEEDBACK_LINK}"
120+
f"Method {method} not supported, following interpolate methods supported: {', '.join(supported_methods)}. {constants.FEEDBACK_LINK}"
112121
)
113-
backwards_window = windows.WindowSpec(following=0)
114-
forwards_window = windows.WindowSpec(preceding=0)
115-
116122
output_column_ids = []
117123

118124
original_columns = block.value_columns
119125
original_labels = block.column_labels
120-
block, offsets = block.promote_offsets()
126+
127+
if method == "linear": # Assumes evenly spaced, ignore index
128+
block, xvalues = block.promote_offsets()
129+
else:
130+
index_columns = block.index_columns
131+
if len(index_columns) != 1:
132+
raise ValueError("only method 'linear' supports multi-index")
133+
xvalues = block.index_columns[0]
134+
if block.index_dtypes[0] not in dtypes.NUMERIC_BIGFRAMES_TYPES:
135+
raise ValueError("Can only interpolate on numeric index.")
136+
121137
for column in original_columns:
122138
# null in same places column is null
123139
should_interpolate = block._column_type(column) in [
124140
pd.Float64Dtype(),
125141
pd.Int64Dtype(),
126142
]
127143
if should_interpolate:
128-
block, notnull = block.apply_unary_op(column, ops.notnull_op)
129-
block, masked_offsets = block.apply_binary_op(
130-
offsets, notnull, ops.partial_arg3(ops.where_op, None)
131-
)
132-
133-
block, previous_value = block.apply_window_op(
134-
column, agg_ops.LastNonNullOp(), backwards_window
135-
)
136-
block, next_value = block.apply_window_op(
137-
column, agg_ops.FirstNonNullOp(), forwards_window
138-
)
139-
block, previous_value_offset = block.apply_window_op(
140-
masked_offsets,
141-
agg_ops.LastNonNullOp(),
142-
backwards_window,
143-
skip_reproject_unsafe=True,
144-
)
145-
block, next_value_offset = block.apply_window_op(
146-
masked_offsets,
147-
agg_ops.FirstNonNullOp(),
148-
forwards_window,
149-
skip_reproject_unsafe=True,
150-
)
151-
152-
block, prediction_id = _interpolate(
144+
interpolate_method_map = {
145+
"linear": "linear",
146+
"values": "linear",
147+
"index": "linear",
148+
"slinear": "linear",
149+
"zero": "ffill",
150+
"nearest": "nearest",
151+
}
152+
extrapolating_methods = ["linear", "values", "index"]
153+
interpolate_method = interpolate_method_map[method]
154+
do_extrapolate = method in extrapolating_methods
155+
block, interpolated = _interpolate_column(
153156
block,
154-
previous_value_offset,
155-
previous_value,
156-
next_value_offset,
157-
next_value,
158-
offsets,
157+
column,
158+
xvalues,
159+
interpolate_method=interpolate_method,
160+
do_extrapolate=do_extrapolate,
159161
)
160-
161-
block, interpolated_column = block.apply_binary_op(
162-
column, prediction_id, ops.fillna_op
163-
)
164-
# Pandas performs ffill-like behavior to extrapolate forwards
165-
block, interpolated_and_ffilled = block.apply_binary_op(
166-
interpolated_column, previous_value, ops.fillna_op
167-
)
168-
169-
output_column_ids.append(interpolated_and_ffilled)
162+
output_column_ids.append(interpolated)
170163
else:
171164
output_column_ids.append(column)
172165

@@ -175,7 +168,80 @@ def interpolate(block: blocks.Block, method: str = "linear") -> blocks.Block:
175168
return block.with_column_labels(original_labels)
176169

177170

178-
def _interpolate(
171+
def _interpolate_column(
    block: blocks.Block,
    column: str,
    x_values: str,
    interpolate_method: str,
    do_extrapolate: bool = True,
) -> typing.Tuple[blocks.Block, str]:
    """Fill the nulls of a single column by interpolating along ``x_values``.

    Args:
        block: Block that holds both ``column`` and ``x_values``.
        column: Id of the column whose nulls are to be filled.
        x_values: Id of the column supplying the x coordinates. It is also
            used as the ordering key of the window lookups below.
        interpolate_method: Point-interpolation strategy; one of "linear",
            "nearest" or "ffill".
        do_extrapolate: When true, predictions that are still null are
            filled from the backward-window ``LastNonNullOp`` lookup on
            ``column``.

    Returns:
        The updated block and the id of the column produced by filling the
        nulls of ``column`` with the prediction.

    Raises:
        ValueError: If ``interpolate_method`` is not a supported strategy.
    """
    if interpolate_method not in ("linear", "nearest", "ffill"):
        raise ValueError("interpolate method not supported")

    # Both windows are ordered by the x column.
    # NOTE(review): presumably following=0 means "rows up to and including
    # the current one" and preceding=0 means "current row onwards" - confirm
    # against WindowSpec.
    x_ordering = (ordering.OrderingColumnReference(x_values),)
    lookbehind = windows.WindowSpec(following=0, ordering=x_ordering)
    lookahead = windows.WindowSpec(preceding=0, ordering=x_ordering)

    # Mask the x coordinate with the not-null indicator of `column`, so the
    # window lookups on it only see x values of rows that carry a value.
    block, has_value = block.apply_unary_op(column, ops.notnull_op)
    block, known_x = block.apply_binary_op(
        x_values, has_value, ops.partial_arg3(ops.where_op, None)
    )

    # Neighbouring known y values on either side of each row.
    block, y_before = block.apply_window_op(
        column, agg_ops.LastNonNullOp(), lookbehind
    )
    block, y_after = block.apply_window_op(
        column, agg_ops.FirstNonNullOp(), lookahead
    )
    # ... and the x coordinates that belong to those neighbours.
    block, x_before = block.apply_window_op(
        known_x,
        agg_ops.LastNonNullOp(),
        lookbehind,
        skip_reproject_unsafe=True,
    )
    block, x_after = block.apply_window_op(
        known_x,
        agg_ops.FirstNonNullOp(),
        lookahead,
        skip_reproject_unsafe=True,
    )

    # Every strategy shares the same (x0, y0, x1, y1, x) calling convention.
    point_interpolators = {
        "linear": _interpolate_points_linear,
        "nearest": _interpolate_points_nearest,
        "ffill": _interpolate_points_ffill,
    }
    estimate_points = point_interpolators[interpolate_method]
    block, estimate = estimate_points(
        block,
        x_before,
        y_before,
        x_after,
        y_after,
        x_values,
    )

    if do_extrapolate:
        # Fall back to the preceding known value where no prediction exists.
        block, estimate = block.apply_binary_op(estimate, y_before, ops.fillna_op)

    # Original values win; the estimate is only used where `column` is null.
    return block.apply_binary_op(column, estimate, ops.fillna_op)
242+
243+
244+
def _interpolate_points_linear(
179245
block: blocks.Block,
180246
x0_id: str,
181247
y0_id: str,
@@ -196,6 +262,53 @@ def _interpolate(
196262
return block, prediction_id
197263

198264

265+
def _interpolate_points_nearest(
    block: blocks.Block,
    x0_id: str,
    y0_id: str,
    x1_id: str,
    y1_id: str,
    xpredict_id: str,
) -> typing.Tuple[blocks.Block, str]:
    """Predict y from whichever neighbouring known point is closer in x.

    Args:
        block: Block containing every column referenced by the ids below.
        x0_id: Column id of the x coordinate of the preceding known point.
        y0_id: Column id of the y value of the preceding known point.
        x1_id: Column id of the x coordinate of the following known point.
        y1_id: Column id of the y value of the following known point.
        xpredict_id: Column id of the x coordinate to predict at.

    Returns:
        The updated block and the id of the prediction column. The
        prediction is masked (``where_op`` with ``None``) unless both
        ``y0`` and ``y1`` are non-null, i.e. it is only produced when
        interpolating between two known points.
    """
    block, dist_to_prev = block.apply_binary_op(xpredict_id, x0_id, ops.sub_op)
    block, dist_to_next = block.apply_binary_op(x1_id, xpredict_id, ops.sub_op)

    # `<=` sends ties to the preceding point; a null comparison result is
    # replaced by False.
    block, prev_is_closer = block.apply_binary_op(
        dist_to_prev, dist_to_next, ops.le_op
    )
    block, prev_is_closer = block.apply_unary_op(
        prev_is_closer, ops.partial_right(ops.fillna_op, False)
    )

    block, closest_y = block.apply_ternary_op(
        y0_id, prev_is_closer, y1_id, ops.where_op
    )

    # Only keep the prediction when a known point exists on both sides.
    block, has_prev = block.apply_unary_op(y0_id, ops.notnull_op)
    block, has_next = block.apply_unary_op(y1_id, ops.notnull_op)
    block, bounded_both_sides = block.apply_binary_op(
        has_prev, has_next, ops.and_op
    )

    block, predicted = block.apply_binary_op(
        closest_y, bounded_both_sides, ops.partial_arg3(ops.where_op, None)
    )

    return block, predicted
293+
294+
295+
def _interpolate_points_ffill(
    block: blocks.Block,
    x0_id: str,
    y0_id: str,
    x1_id: str,
    y1_id: str,
    xpredict_id: str,
) -> typing.Tuple[blocks.Block, str]:
    """Predict y by carrying the preceding known value forward.

    Only ``y0_id`` and ``y1_id`` are read; the x-coordinate ids are accepted
    so the signature matches the other ``_interpolate_points_*`` helpers.

    Args:
        block: Block containing the columns referenced by the ids below.
        x0_id: Column id of the preceding point's x coordinate (unused).
        y0_id: Column id of the y value of the preceding known point.
        x1_id: Column id of the following point's x coordinate (unused).
        y1_id: Column id of the y value of the following known point.
        xpredict_id: Column id of the x coordinate to predict at (unused).

    Returns:
        The updated block and the id of the prediction column.
    """
    # Without a following known value this would be extrapolation rather
    # than interpolation, so the preceding value is masked out in that case.
    block, has_next = block.apply_unary_op(y1_id, ops.notnull_op)
    block, carried_forward = block.apply_binary_op(
        y0_id, has_next, ops.partial_arg3(ops.where_op, None)
    )
    return block, carried_forward
310+
311+
199312
def drop_duplicates(
200313
block: blocks.Block, columns: typing.Sequence[str], keep: str = "first"
201314
) -> blocks.Block:

bigframes/dataframe.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1450,6 +1450,8 @@ def reindex_like(self, other: DataFrame, *, validate: typing.Optional[bool] = No
14501450
return self.reindex(index=other.index, columns=other.columns, validate=validate)
14511451

14521452
def interpolate(self, method: str = "linear") -> DataFrame:
    # NOTE(review): documented with comments rather than a docstring; the
    # user-facing docstring appears to live on the vendored base class -
    # confirm before adding one here.
    if method != "pad":
        # Every other method is handled by the block-level implementation.
        return DataFrame(block_ops.interpolate(self._block, method))
    # "pad" is answered with a plain forward fill.
    return self.ffill()
14551457

bigframes/series.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -475,6 +475,8 @@ def replace(
475475
return Series(block.select_column(result_col))
476476

477477
def interpolate(self, method: str = "linear") -> Series:
    # NOTE(review): documented with comments rather than a docstring; the
    # user-facing docstring appears to live on the vendored base class -
    # confirm before adding one here.
    if method != "pad":
        # Every other method is handled by the block-level implementation.
        return Series(block_ops.interpolate(self._block, method))
    # "pad" is answered with a plain forward fill.
    return self.ffill()
480482

tests/system/small/test_series.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -273,21 +273,26 @@ def test_series_replace_list_scalar(scalars_dfs):
273273

274274

275275
@pytest.mark.parametrize(
276-
("values",),
276+
("method",),
277277
(
278-
([None, 1, 2, None, None, 16, None],),
279-
([None, None, 3.6, None],),
280-
([403.2, None, 352.1, None, None, 111.9],),
278+
("linear",),
279+
("values",),
280+
("slinear",),
281+
("nearest",),
282+
("zero",),
283+
("pad",),
281284
),
282285
)
283-
def test_series_interpolate(values):
284-
pd_series = pd.Series(values)
286+
def test_series_interpolate(method):
287+
values = [None, 1, 2, None, None, 16, None]
288+
index = [-3.2, 11.4, 3.56, 4, 4.32, 5.55, 76.8]
289+
pd_series = pd.Series(values, index)
285290
bf_series = series.Series(pd_series)
286291

287292
# Pandas can only interpolate on "float64" columns
288293
# https://github.com/pandas-dev/pandas/issues/40252
289-
pd_result = pd_series.astype("float64").interpolate()
290-
bf_result = bf_series.interpolate().to_pandas()
294+
pd_result = pd_series.astype("float64").interpolate(method=method)
295+
bf_result = bf_series.interpolate(method=method).to_pandas()
291296

292297
# pd uses non-null types, while bf uses nullable types
293298
pd.testing.assert_series_equal(

third_party/bigframes_vendored/pandas/core/frame.py

Lines changed: 32 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2872,17 +2872,6 @@ def interpolate(self, method: str = "linear"):
28722872
"""
28732873
Fill NaN values using an interpolation method.
28742874
2875-
Args:
2876-
method (str, default 'linear'):
2877-
Interpolation technique to use. Only 'linear' supported.
2878-
'linear': Ignore the index and treat the values as equally spaced.
2879-
This is the only method supported on MultiIndexes.
2880-
2881-
Returns:
2882-
DataFrame:
2883-
Returns the same object type as the caller, interpolated at
2884-
some or all ``NaN`` values
2885-
28862875
**Examples:**
28872876
28882877
>>> import bigframes.pandas as bpd
@@ -2891,17 +2880,41 @@ def interpolate(self, method: str = "linear"):
28912880
>>> df = bpd.DataFrame({
28922881
... 'A': [1, 2, 3, None, None, 6],
28932882
... 'B': [None, 6, None, 2, None, 3],
2894-
... })
2883+
... }, index=[0, 0.1, 0.3, 0.7, 0.9, 1.0])
28952884
>>> df.interpolate()
2896-
A B
2897-
0 1.0 <NA>
2898-
1 2.0 6.0
2899-
2 3.0 4.0
2900-
3 4.0 2.0
2901-
4 5.0 2.5
2902-
5 6.0 3.0
2885+
A B
2886+
0.0 1.0 <NA>
2887+
0.1 2.0 6.0
2888+
0.3 3.0 4.0
2889+
0.7 4.0 2.0
2890+
0.9 5.0 2.5
2891+
1.0 6.0 3.0
29032892
<BLANKLINE>
29042893
[6 rows x 2 columns]
2894+
>>> df.interpolate(method="values")
2895+
A B
2896+
0.0 1.0 <NA>
2897+
0.1 2.0 6.0
2898+
0.3 3.0 4.666667
2899+
0.7 4.714286 2.0
2900+
0.9 5.571429 2.666667
2901+
1.0 6.0 3.0
2902+
<BLANKLINE>
2903+
[6 rows x 2 columns]
2904+
2905+
Args:
2906+
method (str, default 'linear'):
2907+
Interpolation technique to use. Supported methods: 'linear', 'index', 'values', 'pad', 'nearest', 'zero', 'slinear'.
2908+
'linear': Ignore the index and treat the values as equally spaced.
2909+
This is the only method supported on MultiIndexes.
2910+
'index', 'values': use the actual numerical values of the index.
2911+
'pad': Fill in NaNs using existing values.
2912+
'nearest', 'zero', 'slinear': Emulates `scipy.interpolate.interp1d`
2913+
2914+
Returns:
2915+
DataFrame:
2916+
Returns the same object type as the caller, interpolated at
2917+
some or all ``NaN`` values
29052918
"""
29062919
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
29072920

0 commit comments

Comments
 (0)