Skip to content

CoW: add warning mode for cases that will change behaviour #55428

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Oct 30, 2023
Merged
Next Next commit
CoW: add warning mode for cases that will change behaviour
  • Loading branch information
jorisvandenbossche committed Oct 6, 2023
commit d9aefbf9b75d7771dc79c53c2afbc921e45e0fb6
14 changes: 13 additions & 1 deletion pandas/_config/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
"option_context",
"options",
"using_copy_on_write",
"warn_copy_on_write",
]
from pandas._config import config
from pandas._config import dates # pyright: ignore[reportUnusedImport] # noqa: F401
Expand All @@ -32,7 +33,18 @@

def using_copy_on_write() -> bool:
_mode_options = _global_config["mode"]
return _mode_options["copy_on_write"] and _mode_options["data_manager"] == "block"
return (
_mode_options["copy_on_write"] is True
and _mode_options["data_manager"] == "block"
)


def warn_copy_on_write() -> bool:
_mode_options = _global_config["mode"]
return (
_mode_options["copy_on_write"] == "warn"
and _mode_options["data_manager"] == "block"
)


def using_nullable_dtypes() -> bool:
Expand Down
13 changes: 12 additions & 1 deletion pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1994,7 +1994,18 @@ def using_copy_on_write() -> bool:
Fixture to check if Copy-on-Write is enabled.
"""
return (
pd.options.mode.copy_on_write
pd.options.mode.copy_on_write is True
and _get_option("mode.data_manager", silent=True) == "block"
)


@pytest.fixture
def warn_copy_on_write() -> bool:
"""
Fixture to check if Copy-on-Write is enabled.
"""
return (
pd.options.mode.copy_on_write == "warn"
and _get_option("mode.data_manager", silent=True) == "block"
)

Expand Down
6 changes: 4 additions & 2 deletions pandas/core/config_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,9 +476,11 @@ def use_inf_as_na_cb(key) -> None:
"copy_on_write",
# Get the default from an environment variable, if set, otherwise defaults
# to False. This environment variable can be set for testing.
os.environ.get("PANDAS_COPY_ON_WRITE", "0") == "1",
"warn"
if os.environ.get("PANDAS_COPY_ON_WRITE", "0") == "warn"
else os.environ.get("PANDAS_COPY_ON_WRITE", "0") == "1",
copy_on_write_doc,
validator=is_bool,
validator=is_one_of_factory([True, False, "warn"]),
)


Expand Down
3 changes: 2 additions & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from pandas._config import (
config,
using_copy_on_write,
warn_copy_on_write,
)

from pandas._libs import lib
Expand Down Expand Up @@ -4396,7 +4397,7 @@ def _check_setitem_copy(self, t: str = "setting", force: bool_t = False):
df.iloc[0:5]['group'] = 'a'

"""
if using_copy_on_write():
if using_copy_on_write() or warn_copy_on_write():
return

# return early if the check is not needed
Expand Down
19 changes: 15 additions & 4 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@

import numpy as np

from pandas._config import using_copy_on_write
from pandas._config import (
using_copy_on_write,
warn_copy_on_write,
)

from pandas._libs import (
internals as libinternals,
Expand Down Expand Up @@ -1988,9 +1991,17 @@ def setitem_inplace(self, indexer, value) -> None:
in place, not returning a new Manager (and Block), and thus never changing
the dtype.
"""
if using_copy_on_write() and not self._has_no_reference(0):
self.blocks = (self._block.copy(),)
self._cache.clear()
if not self._has_no_reference(0):
if using_copy_on_write():
self.blocks = (self._block.copy(),)
self._cache.clear()
elif warn_copy_on_write():
warnings.warn(
"Setting value on view: behaviour will change in pandas 3.0 "
"with Copy-on-Write ...",
FutureWarning,
stacklevel=find_stack_level(),
)

super().setitem_inplace(indexer, value)

Expand Down
68 changes: 47 additions & 21 deletions pandas/tests/copy_view/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,9 @@ def test_subset_row_slice(backend, using_copy_on_write):
@pytest.mark.parametrize(
"dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
)
def test_subset_column_slice(backend, using_copy_on_write, using_array_manager, dtype):
def test_subset_column_slice(
backend, using_copy_on_write, warn_copy_on_write, using_array_manager, dtype
):
# Case: taking a subset of the columns of a DataFrame using a slice
# + afterwards modifying the subset
dtype_backend, DataFrame, _ = backend
Expand All @@ -159,10 +161,14 @@ def test_subset_column_slice(backend, using_copy_on_write, using_array_manager,

subset.iloc[0, 0] = 0
assert not np.shares_memory(get_array(subset, "b"), get_array(df, "b"))

else:
# we only get a warning in case of a single block
warn = SettingWithCopyWarning if single_block else None
# TODO
warn = (
SettingWithCopyWarning
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am confused, shouldn't this be silent when CoW is enabled as well?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, and that using_copy_on_Write case is already handled in the if block above, so in this else I still need to handle the warn_copy_on_write case.

I could actually do a elif warn_copy_on_write separate block, that might be easier to read and which is also the pattern I started to use in many other places.

if (single_block and not warn_copy_on_write)
else None
)
with pd.option_context("chained_assignment", "warn"):
with tm.assert_produces_warning(warn):
subset.iloc[0, 0] = 0
Expand Down Expand Up @@ -303,7 +309,9 @@ def test_subset_iloc_rows_columns(
[slice(0, 2), np.array([True, True, False]), np.array([0, 1])],
ids=["slice", "mask", "array"],
)
def test_subset_set_with_row_indexer(backend, indexer_si, indexer, using_copy_on_write):
def test_subset_set_with_row_indexer(
backend, indexer_si, indexer, using_copy_on_write, warn_copy_on_write
):
# Case: setting values with a row indexer on a viewing subset
# subset[indexer] = value and subset.iloc[indexer] = value
_, DataFrame, _ = backend
Expand All @@ -318,7 +326,8 @@ def test_subset_set_with_row_indexer(backend, indexer_si, indexer, using_copy_on
):
pytest.skip("setitem with labels selects on columns")

if using_copy_on_write:
# TODO
if using_copy_on_write or warn_copy_on_write:
indexer_si(subset)[indexer] = 0
else:
# INFO iloc no longer raises warning since pandas 1.4
Expand All @@ -340,7 +349,7 @@ def test_subset_set_with_row_indexer(backend, indexer_si, indexer, using_copy_on
tm.assert_frame_equal(df, df_orig)


def test_subset_set_with_mask(backend, using_copy_on_write):
def test_subset_set_with_mask(backend, using_copy_on_write, warn_copy_on_write):
# Case: setting values with a mask on a viewing subset: subset[mask] = value
_, DataFrame, _ = backend
df = DataFrame({"a": [1, 2, 3, 4], "b": [4, 5, 6, 7], "c": [0.1, 0.2, 0.3, 0.4]})
Expand All @@ -349,7 +358,8 @@ def test_subset_set_with_mask(backend, using_copy_on_write):

mask = subset > 3

if using_copy_on_write:
# TODO
if using_copy_on_write or warn_copy_on_write:
subset[mask] = 0
else:
with pd.option_context("chained_assignment", "warn"):
Expand All @@ -370,7 +380,7 @@ def test_subset_set_with_mask(backend, using_copy_on_write):
tm.assert_frame_equal(df, df_orig)


def test_subset_set_column(backend, using_copy_on_write):
def test_subset_set_column(backend, using_copy_on_write, warn_copy_on_write):
# Case: setting a single column on a viewing subset -> subset[col] = value
dtype_backend, DataFrame, _ = backend
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
Expand All @@ -382,7 +392,8 @@ def test_subset_set_column(backend, using_copy_on_write):
else:
arr = pd.array([10, 11], dtype="Int64")

if using_copy_on_write:
# TODO
if using_copy_on_write or warn_copy_on_write:
subset["a"] = arr
else:
with pd.option_context("chained_assignment", "warn"):
Expand Down Expand Up @@ -472,7 +483,7 @@ def test_subset_set_column_with_loc2(backend, using_copy_on_write, using_array_m
@pytest.mark.parametrize(
"dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
)
def test_subset_set_columns(backend, using_copy_on_write, dtype):
def test_subset_set_columns(backend, using_copy_on_write, warn_copy_on_write, dtype):
# Case: setting multiple columns on a viewing subset
# -> subset[[col1, col2]] = value
dtype_backend, DataFrame, _ = backend
Expand All @@ -482,7 +493,8 @@ def test_subset_set_columns(backend, using_copy_on_write, dtype):
df_orig = df.copy()
subset = df[1:3]

if using_copy_on_write:
# TODO
if using_copy_on_write or warn_copy_on_write:
subset[["a", "c"]] = 0
else:
with pd.option_context("chained_assignment", "warn"):
Expand Down Expand Up @@ -879,7 +891,9 @@ def test_del_series(backend):
# Accessing column as Series


def test_column_as_series(backend, using_copy_on_write, using_array_manager):
def test_column_as_series(
backend, using_copy_on_write, warn_copy_on_write, using_array_manager
):
# Case: selecting a single column now also uses Copy-on-Write
dtype_backend, DataFrame, Series = backend
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
Expand All @@ -892,10 +906,14 @@ def test_column_as_series(backend, using_copy_on_write, using_array_manager):
if using_copy_on_write or using_array_manager:
s[0] = 0
else:
warn = SettingWithCopyWarning if dtype_backend == "numpy" else None
with pd.option_context("chained_assignment", "warn"):
with tm.assert_produces_warning(warn):
if warn_copy_on_write:
with tm.assert_produces_warning(FutureWarning):
s[0] = 0
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How important do we find it that we ensure the exact message? (given that this is only temporary, maybe not super important?)

(although I suppose it is not difficult to just add match="Setting a value on a view" to all cases I am adding in this PR.

I don't know how useful it will be to add a custom tm.assert_produces_cow_warning, but it would make the tm.assert_produces_warning(FutureWarning, match="Setting a value on a view"): a bit shorter.

But when adding warnings for more cases, the exact warning message might also differ.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's okay being lax with matching the warning message during testing (but noting a comment that it's CoW related if it's not obvious from the test)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

but noting a comment that it's CoW related if it's not obvious from the test

That might be a reason to actually make something like a tm.assert_produces_cow_warning, then it's also clear without adding a comment everywhere, and will be easier to remove all of those.

else:
warn = SettingWithCopyWarning if dtype_backend == "numpy" else None
with pd.option_context("chained_assignment", "warn"):
with tm.assert_produces_warning(warn):
s[0] = 0

expected = Series([0, 2, 3], name="a")
tm.assert_series_equal(s, expected)
Expand All @@ -910,7 +928,7 @@ def test_column_as_series(backend, using_copy_on_write, using_array_manager):


def test_column_as_series_set_with_upcast(
backend, using_copy_on_write, using_array_manager
backend, using_copy_on_write, using_array_manager, warn_copy_on_write
):
# Case: selecting a single column now also uses Copy-on-Write -> when
# setting a value causes an upcast, we don't need to update the parent
Expand All @@ -921,10 +939,12 @@ def test_column_as_series_set_with_upcast(

s = df["a"]
if dtype_backend == "nullable":
with pytest.raises(TypeError, match="Invalid value"):
s[0] = "foo"
warn = FutureWarning if warn_copy_on_write else None
with tm.assert_produces_warning(warn):
with pytest.raises(TypeError, match="Invalid value"):
s[0] = "foo"
expected = Series([1, 2, 3], name="a")
elif using_copy_on_write or using_array_manager:
elif using_copy_on_write or warn_copy_on_write or using_array_manager:
with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"):
s[0] = "foo"
expected = Series(["foo", 2, 3], dtype=object, name="a")
Expand Down Expand Up @@ -962,7 +982,12 @@ def test_column_as_series_set_with_upcast(
ids=["getitem", "loc", "iloc"],
)
def test_column_as_series_no_item_cache(
request, backend, method, using_copy_on_write, using_array_manager
request,
backend,
method,
using_copy_on_write,
warn_copy_on_write,
using_array_manager,
):
# Case: selecting a single column (which now also uses Copy-on-Write to protect
# the view) should always give a new object (i.e. not make use of a cache)
Expand All @@ -979,7 +1004,8 @@ def test_column_as_series_no_item_cache(
else:
assert s1 is s2

if using_copy_on_write or using_array_manager:
# TODO
if using_copy_on_write or warn_copy_on_write or using_array_manager:
s1.iloc[0] = 0
else:
warn = SettingWithCopyWarning if dtype_backend == "numpy" else None
Expand Down
25 changes: 20 additions & 5 deletions pandas/tests/copy_view/test_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -1651,7 +1651,7 @@ def test_isetitem_frame(using_copy_on_write):


@pytest.mark.parametrize("key", ["a", ["a"]])
def test_get(using_copy_on_write, key):
def test_get(using_copy_on_write, warn_copy_on_write, key):
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
df_orig = df.copy()

Expand All @@ -1665,7 +1665,12 @@ def test_get(using_copy_on_write, key):
else:
# for non-CoW it depends on whether we got a Series or DataFrame if it
# is a view or copy or triggers a warning or not
warn = SettingWithCopyWarning if isinstance(key, list) else None
# TODO(CoW) should warn
warn = (
(None if warn_copy_on_write else SettingWithCopyWarning)
if isinstance(key, list)
else None
)
with pd.option_context("chained_assignment", "warn"):
with tm.assert_produces_warning(warn):
result.iloc[0] = 0
Expand All @@ -1680,7 +1685,9 @@ def test_get(using_copy_on_write, key):
@pytest.mark.parametrize(
"dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
)
def test_xs(using_copy_on_write, using_array_manager, axis, key, dtype):
def test_xs(
using_copy_on_write, warn_copy_on_write, using_array_manager, axis, key, dtype
):
single_block = (dtype == "int64") and not using_array_manager
is_view = single_block or (using_array_manager and axis == 1)
df = DataFrame(
Expand All @@ -1695,8 +1702,13 @@ def test_xs(using_copy_on_write, using_array_manager, axis, key, dtype):
elif using_copy_on_write:
assert result._mgr._has_no_reference(0)

# TODO(CoW) should warn in case of is_view
if using_copy_on_write or is_view:
result.iloc[0] = 0
elif warn_copy_on_write:
warn = FutureWarning if single_block else None
with tm.assert_produces_warning(warn):
result.iloc[0] = 0
else:
with pd.option_context("chained_assignment", "warn"):
with tm.assert_produces_warning(SettingWithCopyWarning):
Expand All @@ -1710,7 +1722,9 @@ def test_xs(using_copy_on_write, using_array_manager, axis, key, dtype):

@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize("key, level", [("l1", 0), (2, 1)])
def test_xs_multiindex(using_copy_on_write, using_array_manager, key, level, axis):
def test_xs_multiindex(
using_copy_on_write, warn_copy_on_write, using_array_manager, key, level, axis
):
arr = np.arange(18).reshape(6, 3)
index = MultiIndex.from_product([["l1", "l2"], [1, 2, 3]], names=["lev1", "lev2"])
df = DataFrame(arr, index=index, columns=list("abc"))
Expand All @@ -1725,8 +1739,9 @@ def test_xs_multiindex(using_copy_on_write, using_array_manager, key, level, axi
get_array(df, df.columns[0]), get_array(result, result.columns[0])
)

# TODO(CoW) should warn
warn = (
SettingWithCopyWarning
(None if warn_copy_on_write else SettingWithCopyWarning)
if not using_copy_on_write and not using_array_manager
else None
)
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/extension/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,6 @@ def using_copy_on_write() -> bool:
Fixture to check if Copy-on-Write is enabled.
"""
return (
options.mode.copy_on_write
options.mode.copy_on_write is True
and _get_option("mode.data_manager", silent=True) == "block"
)
Loading