-
-
Notifications
You must be signed in to change notification settings - Fork 27
Make combine_similar less greedy for merge #334
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
38cef66
7d01aae
a77685b
7d4d372
4f5d78f
63fa4b6
9e8f546
ec5247b
c9031ee
1751a11
3da633e
c901743
34b3441
8f41dc6
f5ace8e
fb249c8
d8843f3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,9 @@ | ||
import pytest | ||
from dask.dataframe.utils import assert_eq | ||
|
||
from dask_expr import from_pandas | ||
from dask_expr import Merge, from_pandas | ||
from dask_expr._expr import Projection | ||
from dask_expr._shuffle import Shuffle | ||
from dask_expr.tests._util import _backend_library | ||
|
||
# Set DataFrame backend for this module | ||
|
@@ -206,3 +208,34 @@ def test_merge_combine_similar(npartitions_left, npartitions_right): | |
expected["new"] = expected.b + expected.c | ||
expected = expected.groupby(["a", "e", "x"]).new.sum() | ||
assert_eq(query, expected) | ||
|
||
|
||
def test_merge_combine_similar_intermediate_projections(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a reproducer for the behavior being targeted by this PR? This test seems to pass on There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah the bug was blocked by the additional layer in #333 but shows up now |
||
pdf = lib.DataFrame( | ||
{ | ||
"a": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], | ||
"b": 1, | ||
"c": 1, | ||
} | ||
) | ||
pdf2 = lib.DataFrame({"a": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "x": 1}) | ||
pdf3 = lib.DataFrame({"d": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "e": 1, "y": 1}) | ||
|
||
df = from_pandas(pdf, npartitions=2) | ||
df2 = from_pandas(pdf2, npartitions=3) | ||
df3 = from_pandas(pdf3, npartitions=3) | ||
|
||
q = df.merge(df2).merge(df3, left_on="b", right_on="d")[["b", "x", "y"]] | ||
q["new"] = q.b + q.x | ||
result = q.optimize(fuse=False) | ||
# Check that we have intermediate projections dropping unnecessary columns | ||
assert isinstance(result.expr.frame, Projection) | ||
assert isinstance(result.expr.frame.frame, Merge) | ||
assert isinstance(result.expr.frame.frame.left, Projection) | ||
assert isinstance(result.expr.frame.frame.left.frame, Shuffle) | ||
|
||
pd_result = pdf.merge(pdf2).merge(pdf3, left_on="b", right_on="d")[["b", "x", "y"]] | ||
pd_result["new"] = pd_result.b + pd_result.x | ||
|
||
assert sorted(result.expr.frame.frame.left.operand("columns")) == ["b", "x"] | ||
assert_eq(result, pd_result, check_index=False) |
Uh oh!
There was an error while loading. Please reload this page.