feat: Allow df.drop to take an index object (#68)

milkshakeiii · Henry J Solberg · tswast · web-flow · commit 740c45176f79 · 2023-10-04T16:17:20.000-07:00
* feat: Allow passing index objects to df.drop

* remove notebook files

* add first implementation for df.drop(index)

* use index_columns property

* don't use _expr.keys()

* fix order bug and add test

* fix index names

* support multiindex

* remove accidentally added files

* add type hint

* remove debug print statements

* fix: fix df/series.iloc by list with multiindex

* complete merge

* remove unneeded isinstance

* refactor _drop_by_index

---------

Co-authored-by: Henry J Solberg <henryjsolberg@google.com>
Co-authored-by: Tim Swast <swast@google.com>
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
@@ -937,7 +937,7 @@ def drop(
                 columns = labels
 
         block = self._block
-        if index:
+        if index is not None:
             level_id = self._resolve_levels(level or 0)[0]
 
             if utils.is_list_like(index):
@@ -947,6 +947,8 @@ def drop(
                 block, condition_id = block.apply_unary_op(
                     inverse_condition_id, ops.invert_op
                 )
+            elif isinstance(index, indexes.Index):
+                return self._drop_by_index(index)
             else:
                 block, condition_id = block.apply_unary_op(
                     level_id, ops.partial_right(ops.ne_op, index)
@@ -956,10 +958,31 @@ def drop(
             )
         if columns:
             block = block.drop_columns(self._sql_names(columns))
-        if not index and not columns:
+        if index is None and not columns:
             raise ValueError("Must specify 'labels' or 'index'/'columns")
         return DataFrame(block)
 
+    def _drop_by_index(self, index: indexes.Index) -> DataFrame:  # helper for drop(index=<Index>): join-based row removal
+        block = index._data._get_block()  # block backing the Index whose labels should be dropped
+        block, ordering_col = block.promote_offsets()  # ordering_col: id of the column returned by promote_offsets()
+        joined_index, (get_column_left, get_column_right) = self._block.index.join(  # getters map pre-join column ids to ids in the joined block
+            block.index  # NOTE(review): default join type; the null check below presumably relies on unmatched rows of self being kept - confirm
+        )
+
+        new_ordering_col = get_column_right(ordering_col)  # id of the drop-side offsets column inside the joined block
+        drop_block = joined_index._block
+        drop_block, drop_col = drop_block.apply_unary_op(  # drop_col = isnull(drop-side offsets column)
+            new_ordering_col,
+            ops.isnull_op,
+        )
+
+        drop_block = drop_block.filter(drop_col)  # presumably keeps rows where drop_col is true, i.e. rows with no match in `index` - confirm
+        original_columns = [  # self's value columns, addressed by their ids in the joined block
+            get_column_left(column) for column in self._block.value_columns
+        ]
+        drop_block = drop_block.select_columns(original_columns)  # select only self's value columns (helper columns are not in the list)
+        return DataFrame(drop_block)
+
     def droplevel(self, level: LevelsType, axis: int | str = 0):
         axis_n = utils.get_axis_number(axis)
         if axis_n == 0:
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
@@ -258,6 +258,61 @@ def test_drop_index(scalars_dfs):
     pd.testing.assert_frame_equal(pd_result, bf_result)
 
 
+def test_drop_pandas_index(scalars_dfs):  # drop(index=<pandas Index>) on scalars_df must match pandas
+    scalars_df, scalars_pandas_df = scalars_dfs
+    drop_index = scalars_pandas_df.iloc[[4, 1, 2]].index  # positions listed out of order
+
+    pd_result = scalars_pandas_df.drop(index=drop_index)  # expected result from pandas
+    bf_result = scalars_df.drop(index=drop_index).to_pandas()  # same pandas Index passed to scalars_df.drop
+
+    pd.testing.assert_frame_equal(pd_result, bf_result)
+
+
+def test_drop_bigframes_index(scalars_dfs):  # drop(index=<index taken from scalars_df>) must match pandas
+    scalars_df, scalars_pandas_df = scalars_dfs
+    drop_index = scalars_df.loc[[4, 1, 2]].index  # index object from the frame under test; labels listed out of order
+    drop_pandas_index = scalars_pandas_df.loc[[4, 1, 2]].index  # equivalent pandas index for the expected result
+
+    pd_result = scalars_pandas_df.drop(index=drop_pandas_index)
+    bf_result = scalars_df.drop(index=drop_index).to_pandas()
+
+    pd.testing.assert_frame_equal(pd_result, bf_result)
+
+
+def test_drop_bigframes_index_with_na(scalars_dfs):  # drop by index object when the index is bytes_col
+    scalars_df, scalars_pandas_df = scalars_dfs
+    scalars_df = scalars_df.copy()  # work on copies of the fixture frames
+    scalars_pandas_df = scalars_pandas_df.copy()
+    scalars_df = scalars_df.set_index("bytes_col")  # presumably bytes_col contains NA values (per the test name) - confirm against fixture
+    scalars_pandas_df = scalars_pandas_df.set_index("bytes_col")
+    drop_index = scalars_df.iloc[[3, 5]].index  # labels at positions 3 and 5
+    drop_pandas_index = scalars_pandas_df.iloc[[3, 5]].index
+
+    pd_result = scalars_pandas_df.drop(index=drop_pandas_index)
+    bf_result = scalars_df.drop(index=drop_index).to_pandas()
+
+    pd.testing.assert_frame_equal(pd_result, bf_result)
+
+
+def test_drop_bigframes_multiindex(scalars_dfs):  # drop by a two-level index object
+    scalars_df, scalars_pandas_df = scalars_dfs
+    scalars_df = scalars_df.copy()
+    scalars_pandas_df = scalars_pandas_df.copy()
+    sub_df = scalars_df.iloc[[4, 1, 2]]  # rows to drop, selected by position before re-indexing
+    sub_pandas_df = scalars_pandas_df.iloc[[4, 1, 2]]
+    sub_df = sub_df.set_index(["bytes_col", "numeric_col"])  # two index levels
+    sub_pandas_df = sub_pandas_df.set_index(["bytes_col", "numeric_col"])
+    drop_index = sub_df.index
+    drop_pandas_index = sub_pandas_df.index
+
+    scalars_df = scalars_df.set_index(["bytes_col", "numeric_col"])  # full frames get the same two-level index
+    scalars_pandas_df = scalars_pandas_df.set_index(["bytes_col", "numeric_col"])
+    bf_result = scalars_df.drop(index=drop_index).to_pandas()
+    pd_result = scalars_pandas_df.drop(index=drop_pandas_index)
+
+    pd.testing.assert_frame_equal(pd_result, bf_result)
+
+
 def test_drop_labels_axis_0(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs