Fix binary operations on attrs for Series and DataFrame#59636
Conversation
fbourgey
commented
Aug 28, 2024
- closes BUG: binary operations don't propogate attrs depending on order with Series and/or DataFrame/Series #51607
- Test
- Test
WillAyd
left a comment
There was a problem hiding this comment.
Small change to prefer fixtures to writing out our own binop implementations, but generally lgtm. I don't think current CI failures are related.
@mroeschke any thoughts here?
| df_2 = DataFrame({"A": [-3, 9]}) | ||
| attrs = {"info": "DataFrame"} | ||
| df_1.attrs = attrs | ||
| assert (df_1 + df_2).attrs == attrs |
There was a problem hiding this comment.
Rather than doing this you can just use the all_binary_operators fixture from conftest.py (I think)
There was a problem hiding this comment.
I made the change.
mroeschke
left a comment
There was a problem hiding this comment.
I think attrs propagation logic should should only be handled by __finalize__, so these binary operations should dispatch to that method
|
@mroeschke should everything be rewritten using |
|
Yes, or |
|
This pull request is stale because it has been open for thirty days with no activity. Please update and respond to this comment if you're still interested in working on this. |
|
@mroeschke, @WillAyd, I tried using |
|
I think it looks good but will defer to @mroeschke |
| def _cmp_method(self, other, op): | ||
| axis: Literal[1] = 1 # only relevant for Series other case | ||
|
|
||
| if not getattr(self, "attrs", None) and getattr(other, "attrs", None): |
There was a problem hiding this comment.
I don't think we should need these anymore here since this should be handled in _construct_result
There was a problem hiding this comment.
It seems that sometimes
self, other = self._align_for_op(other, axis, flex=False, level=None)resets other.attrs to {}.
This is why I kept it.
There was a problem hiding this comment.
Is it because other is getting overridden here? Otherwise, _align_for_op should also preserve the attrs of other.
| @@ -8212,6 +8208,9 @@ def to_series(right): | |||
| ) | |||
| right = left._maybe_align_series_as_frame(right, axis) | |||
There was a problem hiding this comment.
I think this resets the attrs of right
There was a problem hiding this comment.
I would consider that a bug. attrs should be preserved in this function.
There was a problem hiding this comment.
Should I fix it in this PR or raise a different issue?
There was a problem hiding this comment.
Suggested something below
| DataFrame | ||
| """ | ||
| if not getattr(self, "attrs", None) and getattr(other, "attrs", None): | ||
| self.__finalize__(other) |
There was a problem hiding this comment.
Can we do out = out.__finalize(other) instead?
| return self._construct_result(new_data, other=other) | ||
|
|
||
| def _construct_result(self, result) -> DataFrame: | ||
| def _construct_result(self, result, other=None) -> DataFrame: |
There was a problem hiding this comment.
| def _construct_result(self, result, other=None) -> DataFrame: | |
| def _construct_result(self, result, other) -> DataFrame: |
Might as well make this required
| if not getattr(self, "attrs", None) and getattr(other, "attrs", None): | ||
| out = out.__finalize__(other) |
There was a problem hiding this comment.
| if not getattr(self, "attrs", None) and getattr(other, "attrs", None): | |
| out = out.__finalize__(other) | |
| out = out.__finalize__(other) |
Appears __finalize__ will correctly skip if other has a populated attrs
There was a problem hiding this comment.
Doing this breaks the following test:
FAILED pandas/tests/generic/test_duplicate_labels.py::TestPreserves::test_binops[other1-True-add] - AssertionError
FAILED pandas/tests/generic/test_duplicate_labels.py::TestPreserves::test_binops[other1-True-sub] - AssertionErrorThere was a problem hiding this comment.
I think this line __finalize__ needs a fix:
self.flags.allows_duplicate_labels = other.flags.allows_duplicate_labels
Prioritizing False if self.flags.allows_duplicate_labels or other.flags.allows_duplicate_labels is False
There was a problem hiding this comment.
What about doing in __finalize__
if isinstance(other, NDFrame):
if other.attrs:
# We want attrs propagation to have minimal performance
# impact if attrs are not used; i.e. attrs is an empty dict.
# One could make the deepcopy unconditionally, but a deepcopy
# of an empty dict is 50x more expensive than the empty check.
self.attrs = deepcopy(other.attrs)
self.flags.allows_duplicate_labels = (
self.flags.allows_duplicate_labels
and other.flags.allows_duplicate_labels
)There was a problem hiding this comment.
Yup that's the correct location to fix this
|
|
||
| @final | ||
| def _construct_result(self, result, name): | ||
| def _construct_result(self, result, name, other=None): |
There was a problem hiding this comment.
| def _construct_result(self, result, name, other=None): | |
| def _construct_result(self, result, name, other): |
| self, | ||
| result: ArrayLike | tuple[ArrayLike, ArrayLike], | ||
| name: Hashable, | ||
| other: AnyArrayLike | DataFrame | None = None, |
There was a problem hiding this comment.
| other: AnyArrayLike | DataFrame | None = None, | |
| other: AnyArrayLike | DataFrame, |
| ---------- | ||
| result : ndarray or ExtensionArray | ||
| name : Label | ||
| other : Series, DataFrame or array-like, default None |
There was a problem hiding this comment.
| other : Series, DataFrame or array-like, default None | |
| other : Series, DataFrame or array-like |
| if getattr(other, "attrs", None): | ||
| out.__finalize__(other) |
There was a problem hiding this comment.
| if getattr(other, "attrs", None): | |
| out.__finalize__(other) | |
| out = out.__finalize__(other) |
There was a problem hiding this comment.
Doing this breaks:
FAILED pandas/tests/generic/test_duplicate_labels.py::TestPreserves::test_binops[other1-False-add] - AssertionError
FAILED pandas/tests/generic/test_duplicate_labels.py::TestPreserves::test_binops[other1-False-sub] - AssertionErrorsomething to do with flags.allows_duplicate_labels
| return self._construct_result(result, name=res_name, other=other) | ||
|
|
||
| def _construct_result(self, result, name): | ||
| def _construct_result(self, result, name, other=None): |
There was a problem hiding this comment.
| def _construct_result(self, result, name, other=None): | |
| def _construct_result(self, result, name, other): |
| left : DataFrame | ||
| right : Any | ||
| """ | ||
|
|
| "before operating." | ||
| ) | ||
|
|
||
| left, right = left.align( |
There was a problem hiding this comment.
| left, right = left.align( | |
| left, right = left.align( |
|
Thanks for sticking with this @fbourgey! |
|
Thanks for the help @mroeschke! |