diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 1aba48371b430..9b435aa31bd16 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1758,9 +1758,14 @@ def fillna( downcast=downcast, using_cow=using_cow, ) - new_values = self.values.fillna(value=value, method=None, limit=limit) - nb = self.make_block_same_class(new_values) - return nb._maybe_downcast([nb], downcast) + if using_cow and self._can_hold_na and not self.values._hasna: + refs = self.refs + new_values = self.values + else: + refs = None + new_values = self.values.fillna(value=value, method=None, limit=limit) + nb = self.make_block_same_class(new_values, refs=refs) + return nb._maybe_downcast([nb], downcast, using_cow=using_cow) @cache_readonly def shape(self) -> Shape: diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py index c3e1e5f39384b..35bd5d47b36dc 100644 --- a/pandas/tests/copy_view/test_interp_fillna.py +++ b/pandas/tests/copy_view/test_interp_fillna.py @@ -2,6 +2,7 @@ import pytest from pandas import ( + NA, DataFrame, Interval, NaT, @@ -271,3 +272,48 @@ def test_fillna_series_empty_arg_inplace(using_copy_on_write): assert np.shares_memory(get_array(ser), arr) if using_copy_on_write: assert ser._mgr._has_no_reference(0) + + +def test_fillna_ea_noop_shares_memory( + using_copy_on_write, any_numeric_ea_and_arrow_dtype +): + df = DataFrame({"a": [1, NA, 3], "b": 1}, dtype=any_numeric_ea_and_arrow_dtype) + df_orig = df.copy() + df2 = df.fillna(100) + + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + if using_copy_on_write: + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert not df2._mgr._has_no_reference(1) + else: + assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + + tm.assert_frame_equal(df_orig, df) + + df2.iloc[0, 1] = 100 + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert df2._mgr._has_no_reference(1) + assert df._mgr._has_no_reference(1) + tm.assert_frame_equal(df_orig, df) + + +def test_fillna_inplace_ea_noop_shares_memory( + using_copy_on_write, any_numeric_ea_and_arrow_dtype +): + df = DataFrame({"a": [1, NA, 3], "b": 1}, dtype=any_numeric_ea_and_arrow_dtype) + df_orig = df.copy() + view = df[:] + df.fillna(100, inplace=True) + + assert not np.shares_memory(get_array(df, "a"), get_array(view, "a")) + + if using_copy_on_write: + assert np.shares_memory(get_array(df, "b"), get_array(view, "b")) + assert not df._mgr._has_no_reference(1) + assert not view._mgr._has_no_reference(1) + else: + assert not np.shares_memory(get_array(df, "b"), get_array(view, "b")) + df.iloc[0, 1] = 100 + tm.assert_frame_equal(df_orig, view) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 85c2febefb6ce..ca867ffb77296 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -243,24 +243,25 @@ def test_factorize_empty(self, data): def test_fillna_copy_frame(self, data_missing): arr = data_missing.take([1, 1]) df = pd.DataFrame({"A": arr}) + df_orig = df.copy() filled_val = df.iloc[0, 0] result = df.fillna(filled_val) - assert df.A.values is not result.A.values + result.iloc[0, 0] = filled_val - def test_fillna_copy_series(self, data_missing, no_op_with_cow: bool = False): + self.assert_frame_equal(df, df_orig) + + def test_fillna_copy_series(self, data_missing): arr = data_missing.take([1, 1]) ser = pd.Series(arr) + ser_orig = ser.copy() filled_val = ser[0] result = ser.fillna(filled_val) + result.iloc[0] = filled_val - if no_op_with_cow: - assert ser._values is result._values - else: - assert ser._values is not result._values - assert ser._values is arr + self.assert_series_equal(ser, ser_orig) def test_fillna_length_mismatch(self, data_missing): msg = "Length of 'value' does not match." diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py index 3827ba234cfd8..0d14128e3bebf 100644 --- a/pandas/tests/extension/conftest.py +++ b/pandas/tests/extension/conftest.py @@ -2,7 +2,10 @@ import pytest -from pandas import Series +from pandas import ( + Series, + options, +) @pytest.fixture @@ -193,3 +196,11 @@ def invalid_scalar(data): If the array can hold any item (i.e. object dtype), then use pytest.skip. """ return object.__new__(object) + + +@pytest.fixture +def using_copy_on_write() -> bool: + """ + Fixture to check if Copy-on-Write is enabled. + """ + return options.mode.copy_on_write and options.mode.data_manager == "block" diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index d7b9874555046..37a7d78d3aa3d 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -301,17 +301,9 @@ def test_searchsorted(self, data_for_sorting): def test_equals(self, data, na_value, as_series): super().test_equals(data, na_value, as_series) - def test_fillna_copy_frame(self, data_missing, using_copy_on_write): - arr = data_missing.take([1, 1]) - df = pd.DataFrame({"A": arr}) - - filled_val = df.iloc[0, 0] - result = df.fillna(filled_val) - - if using_copy_on_write: - assert df.A.values is result.A.values - else: - assert df.A.values is not result.A.values + @pytest.mark.skip("fill-value is interpreted as a dict of values") + def test_fillna_copy_frame(self, data_missing): + super().test_fillna_copy_frame(data_missing) class TestCasting(BaseJSON, base.BaseCastingTests): diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py index 0ad2de9e834fa..92796c604333d 100644 --- a/pandas/tests/extension/test_datetime.py +++ b/pandas/tests/extension/test_datetime.py @@ -116,11 +116,6 @@ def test_combine_add(self, data_repeated): # Timestamp.__add__(Timestamp) not defined pass - def test_fillna_copy_series(self, data_missing, using_copy_on_write): - super().test_fillna_copy_series( - data_missing, no_op_with_cow=using_copy_on_write - ) - class TestInterface(BaseDatetimeTests, base.BaseInterfaceTests): pass diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 3cf24d848e453..0f916cea9d518 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -132,11 +132,6 @@ def test_combine_add(self, data_repeated): def test_fillna_length_mismatch(self, data_missing): super().test_fillna_length_mismatch(data_missing) - def test_fillna_copy_series(self, data_missing, using_copy_on_write): - super().test_fillna_copy_series( - data_missing, no_op_with_cow=using_copy_on_write - ) - class TestMissing(BaseInterval, base.BaseMissingTests): # Index.fillna only accepts scalar `value`, so we have to xfail all diff --git a/pandas/tests/extension/test_period.py b/pandas/tests/extension/test_period.py index 1ecd279e1f34b..cb1ebd87875e1 100644 --- a/pandas/tests/extension/test_period.py +++ b/pandas/tests/extension/test_period.py @@ -105,11 +105,6 @@ def test_diff(self, data, periods): else: super().test_diff(data, periods) - def test_fillna_copy_series(self, data_missing, using_copy_on_write): - super().test_fillna_copy_series( - data_missing, no_op_with_cow=using_copy_on_write - ) - class TestInterface(BasePeriodTests, base.BaseInterfaceTests): pass diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 745911871694c..2aeb2af567ea0 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -272,7 +272,7 @@ def test_fillna_frame(self, data_missing): class TestMethods(BaseSparseTests, base.BaseMethodsTests): _combine_le_expected_dtype = "Sparse[bool]" - def test_fillna_copy_frame(self, data_missing): + def test_fillna_copy_frame(self, data_missing, using_copy_on_write): arr = data_missing.take([1, 1]) df = pd.DataFrame({"A": arr}, copy=False) @@ -280,17 +280,24 @@ def test_fillna_copy_frame(self, data_missing): result = df.fillna(filled_val) if hasattr(df._mgr, "blocks"): - assert df.values.base is not result.values.base + if using_copy_on_write: + assert df.values.base is result.values.base + else: + assert df.values.base is not result.values.base assert df.A._values.to_dense() is arr.to_dense() - def test_fillna_copy_series(self, data_missing): + def test_fillna_copy_series(self, data_missing, using_copy_on_write): arr = data_missing.take([1, 1]) ser = pd.Series(arr) filled_val = ser[0] result = ser.fillna(filled_val) - assert ser._values is not result._values + if using_copy_on_write: + assert ser._values is result._values + + else: + assert ser._values is not result._values assert ser._values.to_dense() is arr.to_dense() @pytest.mark.xfail(reason="Not Applicable") diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 76ba4c974b3fd..8851b7669932d 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -303,7 +303,9 @@ def test_groupby_raises_datetime_np(how, by, groupby_series, groupby_func_np): @pytest.mark.parametrize("how", ["method", "agg", "transform"]) -def test_groupby_raises_category(how, by, groupby_series, groupby_func): +def test_groupby_raises_category( + how, by, groupby_series, groupby_func, using_copy_on_write +): # GH#50749 df = DataFrame( { @@ -370,7 +372,9 @@ def test_groupby_raises_category(how, by, groupby_series, groupby_func): TypeError, r"Cannot setitem on a Categorical with a new category \(0\), " + "set the categories first", - ), + ) + if not using_copy_on_write + else (None, ""), # no-op with CoW "first": (None, ""), "idxmax": (None, ""), "idxmin": (None, ""), @@ -491,7 +495,7 @@ def test_groupby_raises_category_np(how, by, groupby_series, groupby_func_np): @pytest.mark.parametrize("how", ["method", "agg", "transform"]) def test_groupby_raises_category_on_category( - how, by, groupby_series, groupby_func, observed + how, by, groupby_series, groupby_func, observed, using_copy_on_write ): # GH#50749 df = DataFrame( @@ -562,7 +566,9 @@ def test_groupby_raises_category_on_category( TypeError, r"Cannot setitem on a Categorical with a new category \(0\), " + "set the categories first", - ), + ) + if not using_copy_on_write + else (None, ""), # no-op with CoW "first": (None, ""), "idxmax": (ValueError, "attempt to get argmax of an empty sequence") if empty_groups