Skip to content

Proof of concept for Copy-on-Write implementation #41878

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
462526e
POC for Copy-on-Write
jorisvandenbossche Apr 29, 2021
41ee2b7
clean-up ArrayManager.copy implementation
jorisvandenbossche Jun 8, 2021
7a8dffc
add some comments / docstrings
jorisvandenbossche Jun 8, 2021
1c964be
Merge remote-tracking branch 'upstream/master' into am-experiment-cow
jorisvandenbossche Jun 25, 2021
96b6d71
fix series slice + del operator
jorisvandenbossche Jun 25, 2021
17cedb9
fix pickle and column_setitem with dtype changes
jorisvandenbossche Jun 25, 2021
f4614c2
fix setitem on single column for full slice + update frame tests
jorisvandenbossche Jun 25, 2021
81b09c2
Merge remote-tracking branch 'upstream/master' into am-experiment-cow
jorisvandenbossche Jun 26, 2021
7f183de
fix ref tracking in slicing columns
jorisvandenbossche Jun 26, 2021
693bc4f
fix series setitem -> don't update parent cache
jorisvandenbossche Jun 28, 2021
a154591
update indexing tests with new behaviour to get them passing
jorisvandenbossche Jun 28, 2021
71370c4
Merge remote-tracking branch 'upstream/master' into am-experiment-cow
jorisvandenbossche Jul 12, 2021
4e785d7
Merge remote-tracking branch 'upstream/master' into am-experiment-cow
jorisvandenbossche Aug 4, 2021
6741340
fix new test + skip Series.update tests for now
jorisvandenbossche Aug 4, 2021
c4527e9
remove chained assignment outside indexing tests
jorisvandenbossche Aug 4, 2021
b33aaf2
fix DataFrame.fillna + more chained setitem
jorisvandenbossche Aug 4, 2021
9fdad69
Merge remote-tracking branch 'upstream/master' into am-experiment-cow
jorisvandenbossche Aug 7, 2021
f5da03f
reindex/take on columns to use copy-on-write
jorisvandenbossche Aug 8, 2021
c632757
fix some typing issues
jorisvandenbossche Aug 8, 2021
e4a5f33
fix dtype in test for windows
jorisvandenbossche Aug 8, 2021
5efad50
fix dtype in test for windows
jorisvandenbossche Aug 8, 2021
8ea48cc
Merge remote-tracking branch 'upstream/master' into am-experiment-cow
jorisvandenbossche Aug 9, 2021
79b7a30
Merge remote-tracking branch 'upstream/master' into am-experiment-cow
jorisvandenbossche Oct 11, 2021
cf5c7e2
Merge remote-tracking branch 'upstream/master' into am-experiment-cow
jorisvandenbossche Nov 11, 2021
297662a
[ArrayManager] Array version of putmask logic
jorisvandenbossche Nov 11, 2021
a011a06
fixup copy-on-write in putmask
jorisvandenbossche Nov 11, 2021
cc09001
Update BlockManager expected results after recent behaviour changes
jorisvandenbossche Nov 11, 2021
37e7ce0
limit changes to putmask for this PR
jorisvandenbossche Nov 22, 2021
e34b9b2
Merge remote-tracking branch 'upstream/master' into am-experiment-cow
jorisvandenbossche Nov 22, 2021
64377e0
Merge remote-tracking branch 'upstream/master' into am-experiment-cow
jorisvandenbossche Nov 26, 2021
0f28095
address feedback
jorisvandenbossche Nov 26, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
update indexing tests with new behaviour to get them passing
  • Loading branch information
jorisvandenbossche committed Jun 28, 2021
commit a154591e6976729724c2ac1e26d269803ceff97f
1 change: 1 addition & 0 deletions pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,7 @@ def where(self: T, other, cond, align: bool, errors: str) -> T:
# return self.apply_with_block("setitem", indexer=indexer, value=value)

def putmask(self, mask, new, align: bool = True):
# TODO(CoW) check references to copy if needed
if align:
align_keys = ["new", "mask"]
else:
Expand Down
8 changes: 6 additions & 2 deletions pandas/tests/indexes/period/test_period.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,16 +255,20 @@ def test_is_(self):
assert not index.is_(index - 2)
assert not index.is_(index - 0)

def test_index_duplicate_periods(self):
def test_index_duplicate_periods(self, using_array_manager):
# monotonic
idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq="A-JUN")
ts = Series(np.random.randn(len(idx)), index=idx)
original = ts.copy()

result = ts["2007"]
expected = ts[1:3]
tm.assert_series_equal(result, expected)
result[:] = 1
assert (ts[1:3] == 1).all()
if using_array_manager:
tm.assert_series_equal(ts, original)
else:
assert (ts[1:3] == 1).all()

# not monotonic
idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq="A-JUN")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import pandas.core.common as com


@td.skip_array_manager_not_yet_implemented
def test_detect_chained_assignment():
# Inplace ops, originally from:
# https://stackoverflow.com/questions/20508968/series-fillna-in-a-multiindex-dataframe-does-not-fill-is-this-a-bug
Expand Down
38 changes: 28 additions & 10 deletions pandas/tests/indexing/multiindex/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,16 +390,24 @@ def test_setitem_change_dtype(self, multiindex_dataframe_random_data):
reindexed = dft.reindex(columns=[("foo", "two")])
tm.assert_series_equal(reindexed["foo", "two"], s > s.median())

def test_set_column_scalar_with_loc(self, multiindex_dataframe_random_data):
def test_set_column_scalar_with_loc(
self, multiindex_dataframe_random_data, using_array_manager
):
frame = multiindex_dataframe_random_data
subset = frame.index[[1, 4, 5]]

frame.loc[subset] = 99
assert (frame.loc[subset].values == 99).all()

frame_original = frame.copy()

col = frame["B"]
col[subset] = 97
assert (frame.loc[subset, "B"] == 97).all()
if using_array_manager:
# chained setitem doesn't work with CoW
tm.assert_frame_equal(frame, frame_original)
else:
assert (frame.loc[subset, "B"] == 97).all()

def test_nonunique_assignment_1750(self):
df = DataFrame(
Expand Down Expand Up @@ -472,21 +480,31 @@ def test_frame_setitem_view_direct(multiindex_dataframe_random_data):
assert (df["foo"].values == 0).all()


def test_frame_setitem_copy_raises(multiindex_dataframe_random_data):
def test_frame_setitem_copy_raises(
multiindex_dataframe_random_data, using_array_manager
):
# will raise/warn as its chained assignment
df = multiindex_dataframe_random_data.T
msg = "A value is trying to be set on a copy of a slice from a DataFrame"
with pytest.raises(com.SettingWithCopyError, match=msg):
if using_array_manager:
# TODO(CoW) it would be nice if this could still warn/raise
df["foo"]["one"] = 2
else:
msg = "A value is trying to be set on a copy of a slice from a DataFrame"
with pytest.raises(com.SettingWithCopyError, match=msg):
df["foo"]["one"] = 2


def test_frame_setitem_copy_no_write(multiindex_dataframe_random_data):
def test_frame_setitem_copy_no_write(
multiindex_dataframe_random_data, using_array_manager
):
frame = multiindex_dataframe_random_data.T
expected = frame
df = frame.copy()
msg = "A value is trying to be set on a copy of a slice from a DataFrame"
with pytest.raises(com.SettingWithCopyError, match=msg):
if using_array_manager:
df["foo"]["one"] = 2
else:
msg = "A value is trying to be set on a copy of a slice from a DataFrame"
with pytest.raises(com.SettingWithCopyError, match=msg):
df["foo"]["one"] = 2

result = df
tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(df, expected)
Loading