-
-
Notifications
You must be signed in to change notification settings - Fork 18.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[CLN] Dispatch (some) Frame ops to Series, avoiding _data.eval #22019
Changes from all commits
5195dad
1a66906
a45abec
8594c48
108550e
07fb477
946e54a
323f45e
b3cedf1
a0708d1
5d3db89
aa41de3
01c3720
bcb6735
b3ef417
4b2e21a
330a94a
4c4f626
757e2ae
ff96c0d
17f33b6
c30f898
82a7928
52b7102
453ae8e
93887cf
dd115c8
265ec78
db5ca89
f574c24
e6821a2
da2e32c
a6a7f58
b21a8a2
858165f
31d4089
5832c2b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1666,7 +1666,7 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0): | |
# ----------------------------------------------------------------------------- | ||
# DataFrame | ||
|
||
def dispatch_to_series(left, right, func, str_rep=None): | ||
def dispatch_to_series(left, right, func, str_rep=None, axis=None): | ||
""" | ||
Evaluate the frame operation func(left, right) by evaluating | ||
column-by-column, dispatching to the Series implementation. | ||
|
@@ -1677,6 +1677,7 @@ def dispatch_to_series(left, right, func, str_rep=None): | |
right : scalar or DataFrame | ||
func : arithmetic or comparison operator | ||
str_rep : str or None, default None | ||
axis : {None, 0, 1, "index", "columns"} | ||
|
||
Returns | ||
------- | ||
|
@@ -1700,6 +1701,15 @@ def column_op(a, b): | |
return {i: func(a.iloc[:, i], b.iloc[:, i]) | ||
for i in range(len(a.columns))} | ||
|
||
elif isinstance(right, ABCSeries) and axis == "columns": | ||
# We only get here if called via left._combine_match_columns, | ||
# in which case we specifically want to operate row-by-row | ||
assert right.index.equals(left.columns) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add a doc-comment here (and below) to disambiguate the elifs |
||
|
||
def column_op(a, b): | ||
return {i: func(a.iloc[:, i], b.iloc[i]) | ||
for i in range(len(a.columns))} | ||
|
||
elif isinstance(right, ABCSeries): | ||
assert right.index.equals(left.index) # Handle other cases later | ||
|
||
|
@@ -1844,7 +1854,10 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): | |
pass_op = op if should_series_dispatch(self, other, op) else na_op | ||
return self._combine_frame(other, pass_op, fill_value, level) | ||
elif isinstance(other, ABCSeries): | ||
return _combine_series_frame(self, other, na_op, | ||
# For these values of `axis`, we end up dispatching to Series op, | ||
# so do not want the masked op. | ||
pass_op = op if axis in [0, "columns", None] else na_op | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is pretty janky, can you make simpler? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not really, this needs to be A few PRs down the road I expect to get rid of |
||
return _combine_series_frame(self, other, pass_op, | ||
fill_value=fill_value, axis=axis, | ||
level=level, try_cast=True) | ||
else: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -721,7 +721,7 @@ def test_align_int_fill_bug(self): | |
|
||
result = df1 - df1.mean() | ||
expected = df2 - df2.mean() | ||
assert_frame_equal(result, expected) | ||
assert_frame_equal(result.astype('f8'), expected) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why did this change? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same as below There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this PR needs a note in the what's new describing this change (as its effectievly an API change) |
||
|
||
def test_align_multiindex(self): | ||
# GH 10665 | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1566,8 +1566,9 @@ def test_crosstab_normalize(self): | |
full_normal) | ||
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='index'), | ||
row_normal) | ||
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='columns'), | ||
col_normal) | ||
tm.assert_frame_equal( | ||
pd.crosstab(df.a, df.b, normalize='columns').astype('f8'), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this changing is not great, the point of these ops is that the dtypes won't change on the pivot There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The change is happening because ATM the casting is done at the Block-level, while with the PR casting is done at the column-level.
and under the PR gives:
i.e. the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the correct way to answer this is ask "was the upcast from int to float the result of coercing to a single dtype for multiple columns?" If so, then we should preserve integer dtype where we can, i.e. For an all-integer In [27]: a = pd.Series([1, 2])
In [28]: pd.crosstab(a, a)
Out[28]:
col_0 1 2
row_0
1 1 0
2 0 1 so this change seems good (and maybe worth explicitly testing rather than astyping the output, to ensure that we don't regress)? |
||
col_normal) | ||
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=1), | ||
pd.crosstab(df.a, df.b, normalize='columns')) | ||
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=0), | ||
|
@@ -1600,7 +1601,8 @@ def test_crosstab_normalize(self): | |
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='index', | ||
margins=True), row_normal_margins) | ||
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='columns', | ||
margins=True), col_normal_margins) | ||
margins=True).astype('f8'), | ||
col_normal_margins) | ||
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=True, | ||
margins=True), all_normal_margins) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are there cases where this might no be true? If not, then let's remove it, as it could be somewhat expensive (though maybe not, since the output of align shares indices.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This always holds. I added it largely so I wouldn't have to keep going back to core.ops to double-check what was being passed.