Skip to content

Commit

Permalink
fix metadata comparison
Browse files Browse the repository at this point in the history
Signed-off-by: Dmitry Chigarev <dmitry.chigarev@intel.com>
  • Loading branch information
dchigarev committed Aug 16, 2023
1 parent 76c538f commit a4911cc
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 4 deletions.
20 changes: 16 additions & 4 deletions modin/core/dataframe/pandas/dataframe/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -581,9 +581,22 @@ def _set_columns(self, new_columns):
if self.has_materialized_columns:
# do not set new columns if they're identical to the previous ones
if (
isinstance(new_columns, pandas.Index)
and self.columns.equals(new_columns)
) or np.array_equal(self.columns.values, new_columns):
# `Index.equals()` doesn't compare metadata, thus we have to compare
# it manually. Here we process only the simplest and the most common cases
# (when the index is exactly a 'pandas.Index' or a 'pandas.MultiIndex'). Other
# cases are not that common and we can omit them
type(new_columns) in (pandas.Index, pandas.MultiIndex)
and (
type(new_columns)
is type(self.columns) # noqa; here we need exact types comparison
)
and new_columns.name == self.columns.name
and new_columns.names == self.columns.names
and new_columns.equals(self.columns)
) or (
not isinstance(new_columns, pandas.Index)
and np.array_equal(self.columns.values, new_columns)
):
return
new_columns = self._validate_set_axis(new_columns, self._columns_cache)
if self.has_materialized_dtypes:
Expand Down Expand Up @@ -639,7 +652,6 @@ def _compute_axis_labels_and_lengths(self, axis: int, partitions=None):
List of int
Size of partitions alongside specified `axis`.
"""

if partitions is None:
partitions = self._partitions
new_index, internal_idx = self._partition_mgr_cls.get_indices(axis, partitions)
Expand Down
21 changes: 21 additions & 0 deletions modin/test/storage_formats/pandas/test_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -1166,6 +1166,27 @@ def test_skip_set_columns():
# Verifies that the new columns weren't set if they're equal to the previous ones
assert not df._query_compiler._modin_frame._deferred_column

df = pd.DataFrame({"col1": [1, 2, 3], "col2": [3, 4, 5]})
df.columns = pandas.Index(["col1", "col2"], name="new name")
# Verifies that the new columns were set when only the metadata (index name) differs
assert df.columns.name == "new name"

df = pd.DataFrame(
{("a", "col1"): [1, 2, 3], ("a", "col2"): [3, 4, 5], ("b", "col1"): [6, 7, 8]}
)
df.columns = df.columns.copy()
# Verifies that the new columns weren't set if they're equal to the previous ones
assert not df._query_compiler._modin_frame._deferred_column

df = pd.DataFrame(
{("a", "col1"): [1, 2, 3], ("a", "col2"): [3, 4, 5], ("b", "col1"): [6, 7, 8]}
)
new_cols = df.columns[::-1]
df.columns = new_cols
# Verifies that the new columns were successfully set in case they're actually new
assert df._query_compiler._modin_frame._deferred_column
assert df.columns.equals(new_cols)

df = pd.DataFrame({"col1": [1, 2, 3], "col2": [3, 4, 5]})
remove_axis_cache(df, axis=1)
df.columns = ["col1", "col2"]
Expand Down

0 comments on commit a4911cc

Please sign in to comment.