feat: add dataframe.insert (#770)

Genesis929 · web-flow · commit e8bab681a2d0 · 2024-06-11T10:36:58.000-07:00
* feat: add dataframe.insert

* update logic.

* fix
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
@@ -1305,6 +1305,34 @@ def nsmallest(
         column_ids = self._sql_names(columns)
         return DataFrame(block_ops.nsmallest(self._block, n, column_ids, keep=keep))
 
+    def insert(
+        self,
+        loc: int,
+        column: blocks.Label,
+        value: SingleItemValue,
+        allow_duplicates: bool = False,
+    ):
+        column_count = len(self.columns)
+        if loc > column_count:
+            raise IndexError(
+                f"Column index {loc} is out of bounds with {column_count} total columns."
+            )
+        if (column in self.columns) and not allow_duplicates:
+            raise ValueError(f"cannot insert {column}, already exists")
+
+        temp_column = bigframes.core.guid.generate_guid(prefix=str(column))
+        df = self._assign_single_item(temp_column, value)
+
+        block = df._get_block()
+        value_columns = typing.cast(List, block.value_columns)
+        value_columns, new_column = value_columns[:-1], value_columns[-1]
+        value_columns.insert(loc, new_column)
+
+        block = block.select_columns(value_columns)
+        block = block.rename(columns={temp_column: column})
+
+        self._set_block(block)
+
     def drop(
         self,
         labels: typing.Any = None,
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
@@ -270,6 +270,44 @@ def test_get_columns_default(scalars_dfs):
     assert result == "default_val"
 
 
+@pytest.mark.parametrize(
+    ("loc", "column", "value", "allow_duplicates"),
+    [
+        (0, 666, 2, False),
+        (5, "float64_col", 2.2, True),
+        (13, "rowindex_2", [8, 7, 6, 5, 4, 3, 2, 1, 0], True),
+        pytest.param(
+            14,
+            "test",
+            2,
+            False,
+            marks=pytest.mark.xfail(
+                raises=IndexError,
+            ),
+        ),
+        pytest.param(
+            12,
+            "int64_col",
+            2,
+            False,
+            marks=pytest.mark.xfail(
+                raises=ValueError,
+            ),
+        ),
+    ],
+)
+def test_insert(scalars_dfs, loc, column, value, allow_duplicates):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    # insert works inplace, so will influence other tests.
+    # make a copy to avoid inplace changes.
+    bf_df = scalars_df.copy()
+    pd_df = scalars_pandas_df.copy()
+    bf_df.insert(loc, column, value, allow_duplicates)
+    pd_df.insert(loc, column, value, allow_duplicates)
+
+    pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df, check_dtype=False)
+
+
 def test_drop_column(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
     col_name = "int64_col"
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -1067,6 +1067,51 @@ def reindex_like(self, other):
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    def insert(self, loc, column, value, allow_duplicates=False):
+        """Insert column into DataFrame at specified location.
+
+        The DataFrame is modified in place; nothing is returned.
+
+        Raises a ValueError if `column` is already contained in the DataFrame,
+        unless `allow_duplicates` is set to True.
+
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
+
+        Insert a new column named 'col3' between 'col1' and 'col2' with all entries set to 5.
+
+            >>> df.insert(1, 'col3', 5)
+            >>> df
+               col1  col3  col2
+            0     1     5     3
+            1     2     5     4
+            <BLANKLINE>
+            [2 rows x 3 columns]
+
+        Insert another column named 'col2' at the beginning of the DataFrame with values [5, 6].
+
+            >>> df.insert(0, 'col2', [5, 6], allow_duplicates=True)
+            >>> df
+               col2  col1  col3  col2
+            0     5     1     5     3
+            1     6     2     5     4
+            <BLANKLINE>
+            [2 rows x 4 columns]
+
+        Args:
+            loc (int):
+                Insertion index. Must satisfy 0 <= loc <= len(columns);
+                loc == len(columns) appends the column at the end.
+            column (str, number, or hashable object):
+                Label of the inserted column.
+            value (Scalar, Series, or array-like):
+                Content of the inserted column.
+            allow_duplicates (bool, default False):
+                Allow duplicate column labels to be created.
+
+        Returns:
+            None
+
+        Raises:
+            IndexError:
+                If `loc` is greater than the number of columns.
+            ValueError:
+                If `column` already exists and `allow_duplicates` is False.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     def drop(
         self, labels=None, *, axis=0, index=None, columns=None, level=None
     ) -> DataFrame | None: