Skip to content

Commit e8bab68

Browse files
authored
feat: add dataframe.insert (#770)
* feat: add dataframe.insert * update logic. * fix
1 parent b7b134e commit e8bab68

File tree

3 files changed

+111
-0
lines changed

3 files changed

+111
-0
lines changed

bigframes/dataframe.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1305,6 +1305,34 @@ def nsmallest(
13051305
column_ids = self._sql_names(columns)
13061306
return DataFrame(block_ops.nsmallest(self._block, n, column_ids, keep=keep))
13071307

1308+
def insert(
    self,
    loc: int,
    column: blocks.Label,
    value: SingleItemValue,
    allow_duplicates: bool = False,
):
    """Insert a column into this DataFrame at position ``loc``, in place.

    Args:
        loc: Insertion index. Must satisfy ``0 <= loc <= len(self.columns)``.
        column: Label of the inserted column.
        value: Content of the inserted column; forwarded unchanged to
            ``self._assign_single_item``.
        allow_duplicates: When True, ``column`` may duplicate an existing
            column label.

    Raises:
        IndexError: If ``loc`` is negative or greater than the number of
            columns.
        ValueError: If ``column`` already exists and ``allow_duplicates``
            is False.
    """
    column_count = len(self.columns)
    # Reject negative positions explicitly: ``list.insert`` below would
    # otherwise interpret them as offsets from the end and silently place
    # the column somewhere the caller did not ask for.
    if loc < 0 or loc > column_count:
        raise IndexError(
            f"Column index {loc} is out of bounds with {column_count} total columns."
        )
    if (column in self.columns) and not allow_duplicates:
        raise ValueError(f"cannot insert {column}, already exists")

    # Assign under a generated temporary label first; presumably this keeps
    # an existing column with the same label from being overwritten when
    # duplicates are allowed -- confirm against ``_assign_single_item``.
    temp_column = bigframes.core.guid.generate_guid(prefix=str(column))
    df = self._assign_single_item(temp_column, value)

    block = df._get_block()
    value_columns = typing.cast(List, block.value_columns)
    # Assumes the freshly assigned column is the last value column: pop it
    # off the end and re-insert it at the requested position.
    value_columns, new_column = value_columns[:-1], value_columns[-1]
    value_columns.insert(loc, new_column)

    block = block.select_columns(value_columns)
    # Swap the temporary label for the label the caller requested.
    block = block.rename(columns={temp_column: column})

    self._set_block(block)
1335+
13081336
def drop(
13091337
self,
13101338
labels: typing.Any = None,

tests/system/small/test_dataframe.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,44 @@ def test_get_columns_default(scalars_dfs):
270270
assert result == "default_val"
271271

272272

273+
@pytest.mark.parametrize(
    ("loc", "column", "value", "allow_duplicates"),
    [
        (0, 666, 2, False),
        (5, "float64_col", 2.2, True),
        (13, "rowindex_2", [8, 7, 6, 5, 4, 3, 2, 1, 0], True),
        # Position past the end of the columns: expected to raise IndexError.
        pytest.param(
            14, "test", 2, False, marks=pytest.mark.xfail(raises=IndexError)
        ),
        # Existing label without allow_duplicates: expected to raise ValueError.
        pytest.param(
            12, "int64_col", 2, False, marks=pytest.mark.xfail(raises=ValueError)
        ),
    ],
)
def test_insert(scalars_dfs, loc, column, value, allow_duplicates):
    """Check that DataFrame.insert produces the same frame as pandas."""
    bigframes_source, pandas_source = scalars_dfs

    # insert mutates its receiver, so operate on copies to keep the shared
    # fixture frames untouched for the other tests.
    bigframes_copy = bigframes_source.copy()
    pandas_copy = pandas_source.copy()

    bigframes_copy.insert(loc, column, value, allow_duplicates)
    pandas_copy.insert(loc, column, value, allow_duplicates)

    actual = bigframes_copy.to_pandas()
    pd.testing.assert_frame_equal(actual, pandas_copy, check_dtype=False)
310+
273311
def test_drop_column(scalars_dfs):
274312
scalars_df, scalars_pandas_df = scalars_dfs
275313
col_name = "int64_col"

third_party/bigframes_vendored/pandas/core/frame.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1067,6 +1067,51 @@ def reindex_like(self, other):
10671067
"""
10681068
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
10691069

1070+
def insert(self, loc, column, value, allow_duplicates=False):
    """Insert column into DataFrame at specified location.

    The DataFrame is modified in place. Raises a ValueError if `column` is
    already contained in the DataFrame, unless `allow_duplicates` is set to
    True.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> bpd.options.display.progress_bar = None

        >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})

    Insert a new column named 'col3' between 'col1' and 'col2' with all entries set to 5.

        >>> df.insert(1, 'col3', 5)
        >>> df
           col1  col3  col2
        0     1     5     3
        1     2     5     4
        <BLANKLINE>
        [2 rows x 3 columns]

    Insert another column named 'col2' at the beginning of the DataFrame with values [5, 6]

        >>> df.insert(0, 'col2', [5, 6], allow_duplicates=True)
        >>> df
           col2  col1  col3  col2
        0     5     1     5     3
        1     6     2     5     4
        <BLANKLINE>
        [2 rows x 4 columns]

    Args:
        loc (int):
            Insertion index. Must verify 0 <= loc <= len(columns).
        column (str, number, or hashable object):
            Label of the inserted column.
        value (Scalar, Series, or array-like):
            Content of the inserted column.
        allow_duplicates (bool, default False):
            Allow duplicate column labels to be created.

    Raises:
        IndexError:
            If `loc` is greater than the number of columns.
        ValueError:
            If `column` already exists and `allow_duplicates` is False.
    """
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
1114+
10701115
def drop(
10711116
self, labels=None, *, axis=0, index=None, columns=None, level=None
10721117
) -> DataFrame | None:

0 commit comments

Comments
 (0)