Skip to content

Commit ad2d9d3

Browse files
feat: add interpolate() to series and dataframe
1 parent 79a638e commit ad2d9d3

File tree

7 files changed

+159
-0
lines changed

7 files changed

+159
-0
lines changed

bigframes/core/block_transforms.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,93 @@ def indicate_duplicates(
105105
)
106106

107107

108+
def interpolate_linear(block: blocks.Block) -> blocks.Block:
    """Fill nulls in Float64/Int64 value columns by linear interpolation.

    Row offsets (obtained from ``promote_offsets``) act as the x-axis. For
    each eligible column, the last/first non-null values found through the
    backwards/forwards windows, together with the offsets of those values,
    are passed to ``_interpolate``. Nulls that remain afterwards are filled
    from the previous non-null value. Columns of any other dtype are passed
    through unchanged.

    Args:
        block: Block whose value columns should be interpolated.

    Returns:
        A block containing one output column per original value column,
        carrying the original column labels.
    """
    look_back = windows.WindowSpec(following=0)
    look_ahead = windows.WindowSpec(preceding=0)
    interpolable_dtypes = [pd.Float64Dtype(), pd.Int64Dtype()]

    source_ids = block.value_columns
    labels = block.column_labels
    block, row_offsets = block.promote_offsets()

    result_ids = []
    for col_id in source_ids:
        if block._column_type(col_id) not in interpolable_dtypes:
            # Not a Float64/Int64 column: keep it as-is.
            result_ids.append(col_id)
            continue

        # Row offsets, nulled out in the same places the column is null.
        block, is_present = block.apply_unary_op(col_id, ops.notnull_op)
        block, known_offsets = block.apply_binary_op(
            row_offsets, is_present, ops.partial_arg3(ops.where_op, None)
        )

        # Neighbouring non-null values on either side of each row ...
        block, prev_val = block.apply_window_op(
            col_id, agg_ops.LastNonNullOp(), look_back
        )
        block, next_val = block.apply_window_op(
            col_id, agg_ops.FirstNonNullOp(), look_ahead
        )
        # ... and the offsets at which those values occur.
        block, prev_offset = block.apply_window_op(
            known_offsets,
            agg_ops.LastNonNullOp(),
            look_back,
            skip_reproject_unsafe=True,
        )
        block, next_offset = block.apply_window_op(
            known_offsets,
            agg_ops.FirstNonNullOp(),
            look_ahead,
            skip_reproject_unsafe=True,
        )

        block, predicted = _interpolate(
            block,
            prev_offset,
            prev_val,
            next_offset,
            next_val,
            row_offsets,
        )

        # Original values win; predictions only fill the nulls.
        block, filled = block.apply_binary_op(col_id, predicted, ops.fillna_op)
        # Pandas performs ffill-like behavior to extrapolate forwards
        block, filled_and_ffilled = block.apply_binary_op(
            filled, prev_val, ops.fillna_op
        )

        result_ids.append(filled_and_ffilled)

    # Force a reproject since `skip_reproject_unsafe` was used previously.
    selected = block.select_columns(result_ids)._force_reproject()
    return selected.with_column_labels(labels)
172+
173+
174+
def _interpolate(
    block: blocks.Block,
    x0_id: str,
    y0_id: str,
    x1_id: str,
    y1_id: str,
    xpredict_id: str,
) -> typing.Tuple[blocks.Block, str]:
    """Applies linear interpolation equation to predict y values for xpredict.

    Evaluates ``y0 + (xpredict - x0) * ((y1 - y0) / (x1 - x0))`` using the
    block's binary ops, then drops the intermediate columns it created.

    Args:
        block: Block holding all referenced columns.
        x0_id: Column id of the first known x value.
        y0_id: Column id of the y value at ``x0``.
        x1_id: Column id of the second known x value.
        y1_id: Column id of the y value at ``x1``.
        xpredict_id: Column id of the x values to predict for.

    Returns:
        The updated block and the id of the column holding the predictions.
    """
    block, dx = block.apply_binary_op(x1_id, x0_id, ops.sub_op)
    block, dy = block.apply_binary_op(y1_id, y0_id, ops.sub_op)
    block, x_from_start = block.apply_binary_op(xpredict_id, x0_id, ops.sub_op)

    # slope = dy / dx; rise = distance from x0 scaled by the slope.
    block, slope = block.apply_binary_op(dy, dx, ops.div_op)
    block, rise = block.apply_binary_op(x_from_start, slope, ops.mul_op)

    block, prediction_id = block.apply_binary_op(y0_id, rise, ops.add_op)

    # Only the prediction column is kept; the scratch columns are removed.
    scratch_columns = [dx, dy, x_from_start, slope, rise]
    return block.drop_columns(scratch_columns), prediction_id
193+
194+
108195
def drop_duplicates(
109196
block: blocks.Block, columns: typing.Sequence[str], keep: str = "first"
110197
) -> blocks.Block:

bigframes/dataframe.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1434,6 +1434,10 @@ def _reindex_columns(self, columns):
14341434
def reindex_like(self, other: DataFrame, *, validate: typing.Optional[bool] = None):
    # Reindex against both the row index and the columns of `other` at once.
    return self.reindex(
        index=other.index,
        columns=other.columns,
        validate=validate,
    )
14361436

1437+
def interpolate(self) -> DataFrame:
    # Delegate to the block-level linear interpolation and wrap the resulting
    # block in a new DataFrame.
    return DataFrame(block_ops.interpolate_linear(self._block))
1440+
14371441
def fillna(self, value=None) -> DataFrame:
    # Fill via the shared binary-op helper, using fillna_op with how="left".
    filled = self._apply_binop(value, ops.fillna_op, how="left")
    return filled
14391443

bigframes/series.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -468,6 +468,10 @@ def replace(
468468
)
469469
return Series(block.select_column(result_col))
470470

471+
def interpolate(self) -> Series:
    # Delegate to the block-level linear interpolation and wrap the resulting
    # block in a new Series.
    return Series(block_ops.interpolate_linear(self._block))
474+
471475
def dropna(
472476
self,
473477
*,

tests/system/small/test_dataframe.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -711,6 +711,22 @@ def test_df_dropna(scalars_dfs, axis, how, ignore_index):
711711
pandas.testing.assert_frame_equal(bf_result, pd_result)
712712

713713

714+
def test_df_interpolate(scalars_dfs):
    # Compare interpolate() on the numeric columns against pandas.
    bf_df, pd_df = scalars_dfs
    numeric_columns = ["int64_col", "int64_too", "float64_col"]

    actual = bf_df[numeric_columns].interpolate().to_pandas()
    # Pandas can only interpolate on "float64" columns
    # https://github.com/pandas-dev/pandas/issues/40252
    expected = pd_df[numeric_columns].astype("float64").interpolate()

    # Index and value dtypes are deliberately not compared.
    pandas.testing.assert_frame_equal(
        actual, expected, check_index_type=False, check_dtype=False
    )
728+
729+
714730
def test_df_fillna(scalars_dfs):
715731
scalars_df, scalars_pandas_df = scalars_dfs
716732
df = scalars_df[["int64_col", "float64_col"]].fillna(3)

tests/system/small/test_series.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,32 @@ def test_series_replace_list_scalar(scalars_dfs):
272272
)
273273

274274

275+
@pytest.mark.parametrize(
    ("values",),
    (
        # Leading null, interior gap, and trailing null.
        ([None, 1, 2, None, None, 16, None],),
        # A single non-null value surrounded by nulls.
        ([None, None, 3.6, None],),
        # Float data with interior gaps only.
        ([403.2, None, 352.1, None, None, 111.9],),
    ),
)
def test_series_interpolate(values):
    local_series = pd.Series(values)
    remote_series = series.Series(local_series)

    # Pandas can only interpolate on "float64" columns
    # https://github.com/pandas-dev/pandas/issues/40252
    expected = local_series.astype("float64").interpolate()
    actual = remote_series.interpolate().to_pandas()

    # pd uses non-null types, while bf uses nullable types
    pd.testing.assert_series_equal(
        expected, actual, check_index_type=False, check_dtype=False
    )
299+
300+
275301
@pytest.mark.parametrize(
276302
("ignore_index",),
277303
(

third_party/bigframes_vendored/pandas/core/frame.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2756,6 +2756,17 @@ def value_counts(
27562756
"""
27572757
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
27582758

2759+
def interpolate(self):
    """
    Fill NaN values using an interpolation method.

    Returns:
        DataFrame:
            An object of the same type as the caller, in which some or
            all ``NaN`` values have been replaced by interpolated values.
    """
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
2769+
27592770
def fillna(self, value):
27602771
"""
27612772
Fill NA/NaN values using the specified method.

third_party/bigframes_vendored/pandas/core/series.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -916,6 +916,17 @@ def droplevel(self, level, axis):
916916
"""
917917
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
918918

919+
def interpolate(self):
    """
    Fill NaN values using an interpolation method.

    Returns:
        Series:
            An object of the same type as the caller, in which some or
            all ``NaN`` values have been replaced by interpolated values.
    """
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
929+
919930
def fillna(
920931
self,
921932
value=None,

0 commit comments

Comments
 (0)