FIX-#2362: fix handling slices in 'DataFrame.__setitem__' (#2741)

dchigarev · web-flow · commit 1f3b514cbc01 · 2021-02-18T07:53:10.000-06:00
Signed-off-by: Dmitry Chigarev <dmitry.chigarev@intel.com>
diff --git a/modin/pandas/base.py b/modin/pandas/base.py
@@ -2708,7 +2708,21 @@ def __getitem__(self, key):
         else:
             return self._getitem(key)
 
-    def _getitem_slice(self, key):
+    def _setitem_slice(self, key: slice, value):
+        """
+        Set rows specified by 'key' slice with 'value'.
+
+        Parameters
+        ----------
+        key: location or index based slice,
+            Key that points to the rows to modify.
+        value: any,
+            Value to assign to the rows.
+        """
+        indexer = convert_to_index_sliceable(pandas.DataFrame(index=self.index), key)
+        self.iloc[indexer] = value
+
+    def _getitem_slice(self, key: slice):
         if key.start is None and key.stop is None:
             return self.copy()
         return self.iloc[key]
diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py
@@ -2008,6 +2008,9 @@ def __setattr__(self, key, value):
         object.__setattr__(self, key, value)
 
     def __setitem__(self, key, value):
+        if isinstance(key, slice):
+            return self._setitem_slice(key, value)
+
         if hashable(key) and key not in self.columns:
             if isinstance(value, Series) and len(self.columns) == 0:
                 self._query_compiler = value._query_compiler.copy()
@@ -2038,24 +2041,23 @@ def __setitem__(self, key, value):
             self.insert(loc=len(self.columns), column=key, value=value)
             return
 
-        if not isinstance(key, str):
-
+        if not hashable(key):
             if isinstance(key, DataFrame) or isinstance(key, np.ndarray):
                 if isinstance(key, np.ndarray):
                     if key.shape != self.shape:
                         raise ValueError("Array must be same shape as DataFrame")
                     key = DataFrame(key, columns=self.columns)
                 return self.mask(key, value, inplace=True)
 
-            def setitem_without_string_columns(df):
+            def setitem_unhashable_key(df):
                 # Arrow makes memory-mapped objects immutable, so copy will allow them
                 # to be mutable again.
                 df = df.copy(True)
                 df[key] = value
                 return df
 
             return self._update_inplace(
-                self._default_to_pandas(setitem_without_string_columns)._query_compiler
+                self._default_to_pandas(setitem_unhashable_key)._query_compiler
             )
         if is_list_like(value):
             if isinstance(value, (pandas.DataFrame, DataFrame)):
diff --git a/modin/pandas/series.py b/modin/pandas/series.py
@@ -285,14 +285,8 @@ def __round__(self, decimals=0):
         )
 
     def __setitem__(self, key, value):
-        if isinstance(key, slice) and (
-            isinstance(key.start, int) or isinstance(key.stop, int)
-        ):
-            # There could be two type of slices:
-            #   - Location based slice (1:5)
-            #   - Labels based slice ("a":"e")
-            # For location based slice we're going to `iloc`, since `loc` can't manage it.
-            self.iloc[key] = value
+        if isinstance(key, slice):
+            self._setitem_slice(key, value)
         else:
             self.loc[key] = value
 
diff --git a/modin/pandas/test/dataframe/test_indexing.py b/modin/pandas/test/dataframe/test_indexing.py
@@ -1215,19 +1215,16 @@ def test___setitem__(data):
         df_equals(modin_df, pandas_df)
         assert isinstance(modin_df["new_col"][0], type(pandas_df["new_col"][0]))
 
+    modin_df[1:5] = 10
+    pandas_df[1:5] = 10
+    df_equals(modin_df, pandas_df)
+
     # Transpose test
     modin_df = pd.DataFrame(data).T
     pandas_df = pandas.DataFrame(data).T
 
-    # We default to pandas on non-string column names
-    if not all(isinstance(c, str) for c in modin_df.columns):
-        with pytest.warns(UserWarning):
-            modin_df[modin_df.columns[0]] = 0
-    else:
-        modin_df[modin_df.columns[0]] = 0
-
+    modin_df[modin_df.columns[0]] = 0
     pandas_df[pandas_df.columns[0]] = 0
-
     df_equals(modin_df, pandas_df)
 
     modin_df.columns = [str(i) for i in modin_df.columns]
@@ -1240,7 +1237,10 @@ def test___setitem__(data):
 
     modin_df[modin_df.columns[0]][modin_df.index[0]] = 12345
     pandas_df[pandas_df.columns[0]][pandas_df.index[0]] = 12345
+    df_equals(modin_df, pandas_df)
 
+    modin_df[1:5] = 10
+    pandas_df[1:5] = 10
     df_equals(modin_df, pandas_df)
 
 
diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py
@@ -529,14 +529,31 @@ def test___setitem__(data):
 @pytest.mark.parametrize(
     "key",
     [
-        pytest.param(slice(1, 3), id="numeric_slice"),
-        pytest.param(slice("a", "c"), id="index_based_slice"),
-        pytest.param(["a", "c", "e"], id="list_of_labels"),
-        pytest.param([True, False, True, False, True], id="boolean_mask"),
+        pytest.param(lambda idx: slice(1, 3), id="location_based_slice"),
+        pytest.param(lambda idx: slice(idx[1], idx[-1]), id="index_based_slice"),
+        pytest.param(lambda idx: [idx[0], idx[2], idx[-1]], id="list_of_labels"),
+        pytest.param(
+            lambda idx: [True if i % 2 else False for i in range(len(idx))],
+            id="boolean_mask",
+        ),
     ],
 )
-def test___setitem___non_hashable(key):
-    md_sr, pd_sr = create_test_series([1, 2, 3, 4, 5], index=["a", "b", "c", "d", "e"])
+@pytest.mark.parametrize(
+    "index",
+    [
+        pytest.param(
+            lambda idx_len: [chr(x) for x in range(ord("a"), ord("a") + idx_len)],
+            id="str_index",
+        ),
+        pytest.param(lambda idx_len: list(range(1, idx_len + 1)), id="int_index"),
+    ],
+)
+def test___setitem___non_hashable(key, index):
+    data = np.arange(5)
+    index = index(len(data))
+    key = key(index)
+    md_sr, pd_sr = create_test_series(data, index=index)
+
     md_sr[key] = 10
     pd_sr[key] = 10
     df_equals(md_sr, pd_sr)