Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make combine_similar less greedy for merge #334

Merged
merged 17 commits (source and target branch names were not captured)
Oct 16, 2023
Previous commit
Next commit
Fix
  • Loading branch information
phofl committed Oct 13, 2023
commit c901743f800153c6ed810e3d7636be1162948fcb
28 changes: 13 additions & 15 deletions dask_expr/_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,9 +166,6 @@ def _lower(self):
or shuffle_backend is None
and get_default_shuffle_method() == "p2p"
):
# left = Repartition(left, lambda x: x // 2)
# right = Repartition(right, lambda x: x // 2)

return HashJoinP2P(
left,
right,
Expand Down Expand Up @@ -285,23 +282,24 @@ def _validate_same_operations(self, common, op, remove="both"):
columns_right,
)

# Resolve the column list for one side ("left"/"right" at the call sites
# shown in this diff) of an expression.
#
# expr:    object whose attribute named by `side` exposes `.columns`
# columns: collection of column selections, possibly nested
# side:    attribute name looked up on `expr` when `columns` is empty
#
# NOTE(review): `flatten` is defined outside this view (presumably
# dask.core.flatten) -- confirm against the file's imports.
@staticmethod
def _flatten_columns(expr, columns, side):
# Nothing was collected: fall back to every column of that side of `expr`.
if len(columns) == 0:
return getattr(expr, side).columns
else:
# De-duplicate through a set; the order of the resulting list is therefore
# not guaranteed to follow the input order.
return list(set(flatten(columns)))

def _combine_similar(self, root: Expr):
# Push projections back up to avoid performing the same merge multiple times

def _flatten_columns(columns, side=None):
if len(columns) == 0 and side is not None:
return getattr(self, side).columns
else:
return list(set(flatten(columns)))

left, columns_left = self._remove_operations(
self.left, self._remove_ops, self._skip_ops
)
columns_left = _flatten_columns(columns_left, "left")
columns_left = self._flatten_columns(self, columns_left, "left")
right, columns_right = self._remove_operations(
self.right, self._remove_ops, self._skip_ops
)
columns_right = _flatten_columns(columns_right, "right")
columns_right = self._flatten_columns(self, columns_right, "right")

if left._name == self.left._name and right._name == self.right._name:
# There aren't any ops we can remove, so bail
Expand All @@ -323,22 +321,22 @@ def _flatten_columns(columns, side=None):

validation = self._validate_same_operations(common_right, op, "left")
if validation[0]:
left_sub = _flatten_columns(validation[1])
left_sub = self._flatten_columns(op, validation[1], side="left")
columns = self.right.columns.copy()
columns += [col for col in self.left.columns if col not in columns]
break

validation = self._validate_same_operations(common_left, op, "right")
if validation[0]:
right_sub = _flatten_columns(validation[2])
right_sub = self._flatten_columns(op, validation[2], side="right")
columns = self.left.columns.copy()
columns += [col for col in self.right.columns if col not in columns]
break

validation = self._validate_same_operations(common_both, op)
if validation[0]:
left_sub = _flatten_columns(validation[1])
right_sub = _flatten_columns(validation[2])
left_sub = self._flatten_columns(op, validation[1], side="left")
right_sub = self._flatten_columns(op, validation[2], side="right")
columns = columns_left.copy()
columns += [col for col in columns_right if col not in columns_left]
break
Expand Down
Loading