Skip to content

feat: Add support for full join #2126

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 28 commits into main from full-join
Mar 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
ad17b44
Implemented for core libraries
tylerriccio33 Mar 1, 2025
2154e9e
Implement non coalescing keys for arrow full joins
tylerriccio33 Mar 2, 2025
0b874c7
Remove pandas support for full join
tylerriccio33 Mar 2, 2025
0013fdd
Clarify arrow coalesce behavior as full join specific
tylerriccio33 Mar 2, 2025
cad1d1e
Add libraries to exclusion list
tylerriccio33 Mar 2, 2025
dd31087
Add full join documentation
tylerriccio33 Mar 2, 2025
19a44ed
Add pandas full join
tylerriccio33 Mar 4, 2025
a0df78d
Implement duckdb full join
tylerriccio33 Mar 4, 2025
117a3f3
Fix duckdb not working for lazyframes
tylerriccio33 Mar 4, 2025
dce56e3
Add full join to dask
tylerriccio33 Mar 5, 2025
122fa6a
Fix pandas no collision suffix
tylerriccio33 Mar 5, 2025
f1cbe30
Refine tests to cover more cases
tylerriccio33 Mar 5, 2025
5c4b97d
Abstract pandas and dask full join logic
tylerriccio33 Mar 8, 2025
5561daa
Ensure duckdb join collision consistency
tylerriccio33 Mar 8, 2025
cf096f0
Merge branch 'main' into full-join
tylerriccio33 Mar 8, 2025
f749acc
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 8, 2025
daab7f1
some minor
FBruzzesi Mar 16, 2025
1dd9bdf
Merge branch 'main' into full-join
FBruzzesi Mar 18, 2025
9d0f578
support spark like
FBruzzesi Mar 18, 2025
09c0ca7
old polars (?)
FBruzzesi Mar 18, 2025
ab541aa
split test
FBruzzesi Mar 18, 2025
14a5f0c
align signature, fix spark like cross join on_ value
FBruzzesi Mar 18, 2025
f434b93
fix spark like
FBruzzesi Mar 18, 2025
d03ac8e
typing
FBruzzesi Mar 18, 2025
e6d6cb1
resolve conflicts
FBruzzesi Mar 19, 2025
8d518a8
Merge branch 'main' into full-join
FBruzzesi Mar 22, 2025
d6670a1
resolve conflicts
FBruzzesi Mar 23, 2025
f959045
Merge branch 'main' into full-join
FBruzzesi Mar 24, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion narwhals/_arrow/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,7 +416,7 @@ def join(
self: Self,
other: Self,
*,
how: Literal["left", "inner", "cross", "anti", "semi"],
how: Literal["inner", "left", "full", "cross", "semi", "anti"],
left_on: Sequence[str] | None,
right_on: Sequence[str] | None,
suffix: str,
Expand All @@ -426,6 +426,7 @@ def join(
"semi": "left semi",
"inner": "inner",
"left": "left outer",
"full": "full outer",
}

if how == "cross":
Expand All @@ -450,13 +451,15 @@ def join(
.drop([key_token])
)

coalesce_keys = how != "full" # polars full join does not coalesce keys
return self._from_native_frame(
self.native.join(
other.native,
keys=left_on or [], # type: ignore[arg-type]
right_keys=right_on, # type: ignore[arg-type]
join_type=how_to_join_map[how],
right_suffix=suffix,
coalesce_keys=coalesce_keys,
),
)

Expand Down
2 changes: 1 addition & 1 deletion narwhals/_compliant/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def join(
self: Self,
other: Self,
*,
how: Literal["left", "inner", "cross", "anti", "semi"],
how: Literal["inner", "left", "full", "cross", "semi", "anti"],
left_on: Sequence[str] | None,
right_on: Sequence[str] | None,
suffix: str,
Expand Down
28 changes: 27 additions & 1 deletion narwhals/_dask/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
from narwhals.typing import CompliantDataFrame
from narwhals.typing import CompliantLazyFrame
from narwhals.utils import Implementation
from narwhals.utils import _remap_full_join_keys
from narwhals.utils import check_column_exists
from narwhals.utils import check_column_names_are_unique
from narwhals.utils import generate_temporary_column_name
from narwhals.utils import not_implemented
from narwhals.utils import parse_columns_to_drop
Expand Down Expand Up @@ -255,7 +257,7 @@ def join(
self: Self,
other: Self,
*,
how: Literal["left", "inner", "cross", "anti", "semi"],
how: Literal["inner", "left", "full", "cross", "semi", "anti"],
left_on: Sequence[str] | None,
right_on: Sequence[str] | None,
suffix: str,
Expand Down Expand Up @@ -350,6 +352,30 @@ def join(
extra.append(f"{right_key}_right")
return self._from_native_frame(result_native.drop(columns=extra))

if how == "full":
# dask does not retain keys post-join
# we must append the suffix to each key before-hand

# help mypy
assert left_on is not None # noqa: S101
assert right_on is not None # noqa: S101

right_on_mapper = _remap_full_join_keys(left_on, right_on, suffix)

other_native = other._native_frame
other_native = other_native.rename(columns=right_on_mapper)
check_column_names_are_unique(other_native.columns)
right_on = list(right_on_mapper.values()) # we now have the suffixed keys
return self._from_native_frame(
self._native_frame.merge(
other_native,
left_on=left_on,
right_on=right_on,
how="outer",
suffixes=("", suffix),
),
)

return self._from_native_frame(
self._native_frame.merge(
other._native_frame,
Expand Down
18 changes: 12 additions & 6 deletions narwhals/_duckdb/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,14 +252,16 @@ def join(
self: Self,
other: Self,
*,
how: Literal["left", "inner", "cross", "anti", "semi"],
how: Literal["inner", "left", "full", "cross", "semi", "anti"],
left_on: Sequence[str] | None,
right_on: Sequence[str] | None,
suffix: str,
) -> Self:
original_alias = self._native_frame.alias

if how == "cross":
native_how = "outer" if how == "full" else how

if native_how == "cross":
if self._backend_version < (1, 1, 4):
msg = f"DuckDB>=1.1.4 is required for cross-join, found version: {self._backend_version}"
raise NotImplementedError(msg)
Expand All @@ -274,16 +276,20 @@ def join(
conditions = [
f'lhs."{left}" = rhs."{right}"' for left, right in zip(left_on, right_on)
]

condition = " and ".join(conditions)
rel = self._native_frame.set_alias("lhs").join(
other._native_frame.set_alias("rhs"), condition=condition, how=how
other._native_frame.set_alias("rhs"), condition=condition, how=native_how
)

if how in {"inner", "left", "cross"}:
if native_how in {"inner", "left", "cross", "outer"}:
select = [f'lhs."{x}"' for x in self._native_frame.columns]
for col in other._native_frame.columns:
if col in self._native_frame.columns and (
right_on is None or col not in right_on
col_in_lhs: bool = col in self._native_frame.columns
if native_how == "outer" and not col_in_lhs:
select.append(f'rhs."{col}"')
elif (native_how == "outer") or (
col_in_lhs and (right_on is None or col not in right_on)
):
select.append(f'rhs."{col}" as "{col}{suffix}"')
elif right_on is None or col not in right_on:
Expand Down
26 changes: 25 additions & 1 deletion narwhals/_pandas_like/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from narwhals.exceptions import InvalidOperationError
from narwhals.exceptions import ShapeError
from narwhals.utils import Implementation
from narwhals.utils import _remap_full_join_keys
from narwhals.utils import check_column_exists
from narwhals.utils import generate_temporary_column_name
from narwhals.utils import import_dtypes_module
Expand Down Expand Up @@ -600,7 +601,7 @@ def join(
self: Self,
other: Self,
*,
how: Literal["left", "inner", "cross", "anti", "semi"],
how: Literal["inner", "left", "full", "cross", "semi", "anti"],
left_on: Sequence[str] | None,
right_on: Sequence[str] | None,
suffix: str,
Expand Down Expand Up @@ -723,6 +724,29 @@ def join(
extra.append(f"{right_key}{suffix}")
return self._from_native_frame(result_native.drop(columns=extra))

if how == "full":
# Pandas coalesces keys in full joins unless there's no collision

# help mypy
assert left_on is not None # noqa: S101
assert right_on is not None # noqa: S101

right_on_mapper = _remap_full_join_keys(left_on, right_on, suffix)

other_native = other._native_frame
other_native = other_native.rename(columns=right_on_mapper)
check_column_names_are_unique(other_native.columns)
right_on = list(right_on_mapper.values()) # we now have the suffixed keys
return self._from_native_frame(
self._native_frame.merge(
other_native,
left_on=left_on,
right_on=right_on,
how="outer",
suffixes=("", suffix),
),
)

return self._from_native_frame(
self.native.merge(
other._native_frame,
Expand Down
48 changes: 46 additions & 2 deletions narwhals/_polars/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ class PolarsDataFrame:
item: Method[Any]
iter_rows: Method[Iterator[tuple[Any, ...]] | Iterator[Mapping[str, Any]]]
is_unique: Method[PolarsSeries]
join: Method[Self]
join_asof: Method[Self]
rename: Method[Self]
row: Method[tuple[Any, ...]]
Expand Down Expand Up @@ -417,14 +416,36 @@ def pivot(
def to_polars(self: Self) -> pl.DataFrame:
return self.native

def join(
    self: Self,
    other: Self,
    *,
    how: Literal["inner", "left", "full", "cross", "semi", "anti"],
    left_on: Sequence[str] | None,
    right_on: Sequence[str] | None,
    suffix: str,
) -> Self:
    """Join this frame with ``other`` through the native frame's ``join``.

    Args:
        other: Frame whose ``_native_frame`` is used as the right-hand side.
        how: Join strategy requested by the caller.
        left_on: Key columns of this frame, forwarded unchanged.
        right_on: Key columns of ``other``, forwarded unchanged.
        suffix: Suffix forwarded unchanged to the native ``join``.

    Returns:
        The native join result wrapped with ``self._from_native_frame``.
    """
    # When the backend version is below 1.0.0, a "full" join is requested
    # from the native frame under the name "outer"; every other strategy
    # (and every newer backend) is passed through as given.
    use_outer_alias = self._backend_version < (1, 0, 0) and how == "full"
    strategy = "outer" if use_outer_alias else how

    joined = self._native_frame.join(
        other=other._native_frame,
        how=strategy,  # type: ignore[arg-type]
        left_on=left_on,
        right_on=right_on,
        suffix=suffix,
    )
    return self._from_native_frame(joined)


class PolarsLazyFrame:
drop_nulls: Method[Self]
explode: Method[Self]
filter: Method[Self]
gather_every: Method[Self]
head: Method[Self]
join: Method[Self]
join_asof: Method[Self]
rename: Method[Self]
select: Method[Self]
Expand Down Expand Up @@ -616,3 +637,26 @@ def simple_select(self, *column_names: str) -> Self:

def aggregate(self: Self, *exprs: Any) -> Self:
return self.select(*exprs)

def join(
    self: Self,
    other: Self,
    *,
    how: Literal["inner", "left", "full", "cross", "semi", "anti"],
    left_on: Sequence[str] | None,
    right_on: Sequence[str] | None,
    suffix: str,
) -> Self:
    """Join this lazy frame with ``other`` via the native frame's ``join``.

    Args:
        other: Frame whose ``_native_frame`` is used as the right-hand side.
        how: Join strategy requested by the caller.
        left_on: Key columns of this frame, forwarded unchanged.
        right_on: Key columns of ``other``, forwarded unchanged.
        suffix: Suffix forwarded unchanged to the native ``join``.

    Returns:
        The native join result wrapped with ``self._from_native_frame``.
    """
    # Backends with a version below 1.0.0 receive "outer" in place of
    # "full"; all other combinations pass ``how`` through untouched.
    if self._backend_version < (1, 0, 0) and how == "full":
        native_strategy = "outer"
    else:
        native_strategy = how

    native_result = self._native_frame.join(
        other=other._native_frame,
        how=native_strategy,  # type: ignore[arg-type]
        left_on=left_on,
        right_on=right_on,
        suffix=suffix,
    )
    return self._from_native_frame(native_result)
41 changes: 35 additions & 6 deletions narwhals/_spark_like/dataframe.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from __future__ import annotations

import warnings
from functools import reduce
from operator import and_
from typing import TYPE_CHECKING
from typing import Any
from typing import Iterator
Expand Down Expand Up @@ -328,7 +330,7 @@ def unique(
def join(
self: Self,
other: Self,
how: Literal["inner", "left", "cross", "semi", "anti"],
how: Literal["inner", "left", "full", "cross", "semi", "anti"],
left_on: Sequence[str] | None,
right_on: Sequence[str] | None,
suffix: str,
Expand All @@ -339,14 +341,23 @@ def join(
left_columns = self.columns
right_columns = other.columns

right_on_: list[str] = list(right_on) if right_on is not None else []
left_on_: list[str] = list(left_on) if left_on is not None else []

# create a mapping for columns on other
# `right_on` columns will be renamed as `left_on`
# the remaining columns will be either added the suffix or left unchanged.
right_cols_to_rename = (
[c for c in right_columns if c not in right_on_]
if how != "full"
else right_columns
)

rename_mapping = {
**dict(zip(right_on or [], left_on or [])),
**dict(zip(right_on_, left_on_)),
**{
colname: f"{colname}{suffix}" if colname in left_columns else colname
for colname in list(set(right_columns).difference(set(right_on or [])))
for colname in right_cols_to_rename
},
}
other_native = other_native.select(
Expand All @@ -363,12 +374,30 @@ def join(
[
rename_mapping[colname]
for colname in right_columns
if colname not in (right_on or [])
if colname not in right_on_
]
)
on = list(left_on) if left_on else None
elif how == "full":
col_order.extend(rename_mapping.values())

right_on_remapped = [rename_mapping[c] for c in right_on_]
on_ = (
reduce(
and_,
(
getattr(self_native, left_key) == getattr(other_native, right_key)
for left_key, right_key in zip(left_on_, right_on_remapped)
),
)
if how == "full"
else None
if how == "cross"
else left_on_
)
how_native = "full_outer" if how == "full" else how

return self._from_native_frame(
self_native.join(other_native, on=on, how=how).select(col_order)
self_native.join(other_native, on=on_, how=how_native).select(col_order)
)

def explode(self: Self, columns: Sequence[str]) -> Self:
Expand Down
12 changes: 8 additions & 4 deletions narwhals/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ def join(
self: Self,
other: Self,
on: str | list[str] | None = None,
how: Literal["inner", "left", "cross", "semi", "anti"] = "inner",
how: Literal["inner", "left", "full", "cross", "semi", "anti"] = "inner",
*,
left_on: str | list[str] | None = None,
right_on: str | list[str] | None = None,
Expand All @@ -247,7 +247,9 @@ def join(
left_on = [left_on] if isinstance(left_on, str) else left_on
right_on = [right_on] if isinstance(right_on, str) else right_on

if how not in (_supported_joins := ("inner", "left", "cross", "anti", "semi")):
if how not in (
_supported_joins := ("inner", "left", "full", "cross", "anti", "semi")
):
msg = f"Only the following join strategies are supported: {_supported_joins}; found '{how}'."
raise NotImplementedError(msg)

Expand Down Expand Up @@ -1622,7 +1624,7 @@ def join(
self: Self,
other: Self,
on: str | list[str] | None = None,
how: Literal["inner", "left", "cross", "semi", "anti"] = "inner",
how: Literal["inner", "left", "full", "cross", "semi", "anti"] = "inner",
*,
left_on: str | list[str] | None = None,
right_on: str | list[str] | None = None,
Expand All @@ -1638,6 +1640,7 @@ def join(

* *inner*: Returns rows that have matching values in both tables.
* *left*: Returns all rows from the left table, and the matched rows from the right table.
* *full*: Returns all rows in both dataframes, with the suffix appended to the right join keys.
* *cross*: Returns the Cartesian product of rows from both tables.
* *semi*: Filter rows that have a match in the right table.
* *anti*: Filter rows that do not have a match in the right table.
Expand Down Expand Up @@ -2909,7 +2912,7 @@ def join(
self: Self,
other: Self,
on: str | list[str] | None = None,
how: Literal["inner", "left", "cross", "semi", "anti"] = "inner",
how: Literal["inner", "left", "full", "cross", "semi", "anti"] = "inner",
*,
left_on: str | list[str] | None = None,
right_on: str | list[str] | None = None,
Expand All @@ -2925,6 +2928,7 @@ def join(

* *inner*: Returns rows that have matching values in both tables.
* *left*: Returns all rows from the left table, and the matched rows from the right table.
* *full*: Returns all rows in both dataframes, with the suffix appended to the right join keys.
* *cross*: Returns the Cartesian product of rows from both tables.
* *semi*: Filter rows that have a match in the right table.
* *anti*: Filter rows that do not have a match in the right table.
Expand Down
Loading
Loading