Skip to content

Commit

Permalink
docs: Add non-equi joins to, and revise, joins docs page (#19127)
Browse files Browse the repository at this point in the history
  • Loading branch information
rodrigogiraoserrao authored Oct 10, 2024
1 parent 48bc09b commit 4344d21
Show file tree
Hide file tree
Showing 8 changed files with 548 additions and 395 deletions.
2 changes: 2 additions & 0 deletions crates/polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,8 @@ docs-selection = [
"is_last_distinct",
"asof_join",
"cross_join",
"semi_anti_join",
"iejoin",
"concat_str",
"string_reverse",
"string_to_integer",
Expand Down
19 changes: 18 additions & 1 deletion docs/source/_build/API_REFERENCE_LINKS.yml
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ python:
name: execute
link: https://docs.pola.rs/api/python/stable/reference/sql/api/polars.SQLContext.execute.html
join_asof: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join_asof.html
join_where: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join_where.html
concat: https://docs.pola.rs/api/python/stable/reference/api/polars.concat.html
pivot: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.pivot.html
unpivot: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.unpivot.html
Expand Down Expand Up @@ -180,6 +181,11 @@ rust:
link: https://docs.pola.rs/api/rust/dev/polars_lazy/frame/struct.LazyFrame.html#method.group_by_dynamic
feature_flags: [dynamic_group_by]
join: https://docs.pola.rs/api/rust/dev/polars/prelude/trait.DataFrameJoinOps.html#method.join
join-semi_anti_join_flag:
name: join
link: https://docs.pola.rs/api/rust/dev/polars/prelude/trait.DataFrameJoinOps.html#method.join
feature_flags: ["semi_anti_join"]

vstack: https://docs.pola.rs/api/rust/dev/polars_core/frame/struct.DataFrame.html#method.vstack
concat: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/functions/fn.concat.html

Expand All @@ -193,7 +199,18 @@ rust:
pivot: https://docs.pola.rs/api/rust/dev/polars_lazy/frame/pivot/fn.pivot.html
unpivot: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html#method.unpivot
upsample: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html#method.upsample
join_asof: https://docs.pola.rs/api/rust/dev/polars/prelude/trait.AsofJoin.html#method.join_asof
join_asof_by:
name: join_asof_by
link: https://docs.pola.rs/api/rust/dev/polars/prelude/trait.AsofJoinBy.html#method.join_asof_by
feature_flags: ['asof_join']
join_where:
name: join_where
link: https://docs.pola.rs/api/rust/dev/polars/prelude/struct.JoinBuilder.html#method.join_where
feature_flags: ["iejoin"]
cross_join:
name: cross_join
link: https://docs.pola.rs/api/rust/dev/polars/prelude/struct.LazyFrame.html#method.cross_join
feature_flags: [cross_join]
unnest: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html#method.unnest

read_csv:
Expand Down
20 changes: 16 additions & 4 deletions docs/source/_build/scripts/macro.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from collections import OrderedDict
import os
from typing import List, Optional, Set
from typing import Any, List, Optional, Set
import yaml
import logging


from mkdocs_macros.plugin import MacrosPlugin

# Supported Languages and their metadata
LANGUAGES = OrderedDict(
python={
Expand Down Expand Up @@ -130,7 +132,7 @@ def code_tab(
"""


def define_env(env):
def define_env(env: MacrosPlugin) -> None:
@env.macro
def code_header(
language: str, section: str = [], api_functions: List[str] = []
Expand All @@ -154,7 +156,11 @@ def code_header(

@env.macro
def code_block(
path: str, section: str = None, api_functions: List[str] = None
path: str,
section: str = None,
api_functions: List[str] = None,
python_api_functions: List[str] = None,
rust_api_functions: List[str] = None,
) -> str:
"""Dynamically generate a code block for the code located under {language}/path
Expand All @@ -170,8 +176,14 @@ def code_block(
for language, info in LANGUAGES.items():
base_path = f"{language}/{path}{info['extension']}"
full_path = "docs/source/src/" + base_path
if language == "python":
extras = python_api_functions or []
else:
extras = rust_api_functions or []
# Check if file exists for the language
if os.path.exists(full_path):
result.append(code_tab(base_path, section, info, api_functions))
# `api_functions` defaults to None; fall back to an empty list so that
# concatenating the language-specific extras cannot raise a TypeError
# when the caller passes no common API links.
result.append(
    code_tab(base_path, section, info, (api_functions or []) + extras)
)

return "\n".join(result)
7 changes: 7 additions & 0 deletions docs/source/development/contributing/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,13 @@ df = pl.read_parquet("file.parquet")

The snippet is delimited by `--8<-- [start:<snippet_name>]` and `--8<-- [end:<snippet_name>]`. The snippet name must match the name given in the second argument to `code_block` above.

In some cases, the Python and Rust APIs need links to different functions.
When that is the case, use the two extra optional arguments that `code_block` accepts to pass Python-only and Rust-only links:

```
{{code_block('path', 'snippet_name', ['common_api_links'], ['python_only_links'], ['rust_only_links'])}}
```

#### Linting

Before committing, install `dprint` (see above) and run `dprint fmt` from the `docs` directory to lint the markdown files.
Expand Down
235 changes: 129 additions & 106 deletions docs/source/src/python/user-guide/transformations/joins.py
Original file line number Diff line number Diff line change
@@ -1,117 +1,138 @@
# --8<-- [start:setup]
# --8<-- [start:prep-data]
import pathlib
import requests


DATA = [
(
"https://raw.githubusercontent.com/pola-rs/polars-static/refs/heads/master/data/monopoly_props_groups.csv",
"docs/assets/data/monopoly_props_groups.csv",
),
(
"https://raw.githubusercontent.com/pola-rs/polars-static/refs/heads/master/data/monopoly_props_prices.csv",
"docs/assets/data/monopoly_props_prices.csv",
),
]


# Download each data file used by the examples unless it is already on disk.
for url, dest in DATA:
    dest_path = pathlib.Path(dest)
    if dest_path.exists():
        continue
    # Fetch before touching the destination: opening the file first would
    # leave an empty file behind if the request fails, and later runs would
    # then mistake it for a completed download.
    response = requests.get(url, timeout=10)
    # Fail loudly on HTTP errors instead of saving an error page as CSV.
    response.raise_for_status()
    dest_path.parent.mkdir(parents=True, exist_ok=True)
    dest_path.write_bytes(response.content)
# --8<-- [end:prep-data]

# --8<-- [start:props_groups]
import polars as pl
from datetime import datetime

# --8<-- [end:setup]

# --8<-- [start:innerdf]
df_customers = pl.DataFrame(
{
"customer_id": [1, 2, 3],
"name": ["Alice", "Bob", "Charlie"],
}
)
print(df_customers)
# --8<-- [end:innerdf]

# --8<-- [start:innerdf2]
df_orders = pl.DataFrame(
{
"order_id": ["a", "b", "c"],
"customer_id": [1, 2, 2],
"amount": [100, 200, 300],
}
)
print(df_orders)
# --8<-- [end:innerdf2]


# --8<-- [start:inner]
df_inner_customer_join = df_customers.join(df_orders, on="customer_id", how="inner")
print(df_inner_customer_join)
# --8<-- [end:inner]
props_groups = pl.read_csv("docs/assets/data/monopoly_props_groups.csv").head(5)
print(props_groups)
# --8<-- [end:props_groups]

# --8<-- [start:left]
df_left_join = df_customers.join(df_orders, on="customer_id", how="left")
print(df_left_join)
# --8<-- [end:left]
# --8<-- [start:props_prices]
props_prices = pl.read_csv("docs/assets/data/monopoly_props_prices.csv").head(5)
print(props_prices)
# --8<-- [end:props_prices]

# --8<-- [start:right]
df_right_join = df_orders.join(df_customers, on="customer_id", how="right")
print(df_right_join)
# --8<-- [end:right]
# --8<-- [start:equi-join]
result = props_groups.join(props_prices, on="property_name")
print(result)
# --8<-- [end:equi-join]

# --8<-- [start:full]
df_outer_join = df_customers.join(df_orders, on="customer_id", how="full")
print(df_outer_join)
# --8<-- [end:full]

# --8<-- [start:full_coalesce]
df_outer_coalesce_join = df_customers.join(
df_orders, on="customer_id", how="full", coalesce=True
# --8<-- [start:props_groups2]
props_groups2 = props_groups.with_columns(
pl.col("property_name").str.to_lowercase(),
)
print(df_outer_coalesce_join)
# --8<-- [end:full_coalesce]
print(props_groups2)
# --8<-- [end:props_groups2]

# --8<-- [start:df3]
df_colors = pl.DataFrame(
{
"color": ["red", "blue", "green"],
}
# --8<-- [start:props_prices2]
props_prices2 = props_prices.select(
pl.col("property_name").alias("name"), pl.col("cost")
)
print(df_colors)
# --8<-- [end:df3]

# --8<-- [start:df4]
df_sizes = pl.DataFrame(
{
"size": ["S", "M", "L"],
}
print(props_prices2)
# --8<-- [end:props_prices2]

# --8<-- [start:join-key-expression]
result = props_groups2.join(
props_prices2,
left_on="property_name",
right_on=pl.col("name").str.to_lowercase(),
)
print(result)
# --8<-- [end:join-key-expression]

# --8<-- [start:inner-join]
result = props_groups.join(props_prices, on="property_name", how="inner")
print(result)
# --8<-- [end:inner-join]

# --8<-- [start:left-join]
result = props_groups.join(props_prices, on="property_name", how="left")
print(result)
# --8<-- [end:left-join]

# --8<-- [start:right-join]
result = props_groups.join(props_prices, on="property_name", how="right")
print(result)
# --8<-- [end:right-join]

# --8<-- [start:left-right-join-equals]
print(
result.equals(
props_prices.join(
props_groups,
on="property_name",
how="left",
# Reorder the columns to match the order from above.
).select(pl.col("group"), pl.col("property_name"), pl.col("cost"))
)
)
print(df_sizes)
# --8<-- [end:df4]
# --8<-- [end:left-right-join-equals]

# --8<-- [start:full-join]
result = props_groups.join(props_prices, on="property_name", how="full")
print(result)
# --8<-- [end:full-join]

# --8<-- [start:full-join-coalesce]
result = props_groups.join(
props_prices,
on="property_name",
how="full",
coalesce=True,
)
print(result)
# --8<-- [end:full-join-coalesce]

# --8<-- [start:cross]
df_cross_join = df_colors.join(df_sizes, how="cross")
print(df_cross_join)
# --8<-- [end:cross]
# --8<-- [start:semi-join]
result = props_groups.join(props_prices, on="property_name", how="semi")
print(result)
# --8<-- [end:semi-join]

# --8<-- [start:df5]
df_cars = pl.DataFrame(
{
"id": ["a", "b", "c"],
"make": ["ford", "toyota", "bmw"],
}
)
print(df_cars)
# --8<-- [end:df5]
# --8<-- [start:anti-join]
result = props_groups.join(props_prices, on="property_name", how="anti")
print(result)
# --8<-- [end:anti-join]

# --8<-- [start:df6]
df_repairs = pl.DataFrame(
# --8<-- [start:players]
players = pl.DataFrame(
{
"id": ["c", "c"],
"cost": [100, 200],
"name": ["Alice", "Bob"],
"cash": [78, 135],
}
)
print(df_repairs)
# --8<-- [end:df6]

# --8<-- [start:inner2]
df_inner_join = df_cars.join(df_repairs, on="id", how="inner")
print(df_inner_join)
# --8<-- [end:inner2]
print(players)
# --8<-- [end:players]

# --8<-- [start:semi]
df_semi_join = df_cars.join(df_repairs, on="id", how="semi")
print(df_semi_join)
# --8<-- [end:semi]
# --8<-- [start:non-equi]
result = players.join_where(props_prices, pl.col("cash") > pl.col("cost"))
print(result)
# --8<-- [end:non-equi]

# --8<-- [start:anti]
df_anti_join = df_cars.join(df_repairs, on="id", how="anti")
print(df_anti_join)
# --8<-- [end:anti]
# --8<-- [start:df_trades]
from datetime import datetime

# --8<-- [start:df7]
df_trades = pl.DataFrame(
{
"time": [
Expand All @@ -125,9 +146,9 @@
}
)
print(df_trades)
# --8<-- [end:df7]
# --8<-- [end:df_trades]

# --8<-- [start:df8]
# --8<-- [start:df_quotes]
df_quotes = pl.DataFrame(
{
"time": [
Expand All @@ -142,21 +163,23 @@
)

print(df_quotes)
# --8<-- [end:df8]

# --8<-- [start:asofpre]
df_trades = df_trades.sort("time")
df_quotes = df_quotes.sort("time") # Set column as sorted
# --8<-- [end:asofpre]
# --8<-- [end:df_quotes]

# --8<-- [start:asof]
df_asof_join = df_trades.join_asof(df_quotes, on="time", by="stock")
print(df_asof_join)
# --8<-- [end:asof]

# --8<-- [start:asof2]
# --8<-- [start:asof-tolerance]
df_asof_tolerance_join = df_trades.join_asof(
df_quotes, on="time", by="stock", tolerance="1m"
)
print(df_asof_tolerance_join)
# --8<-- [end:asof2]
# --8<-- [end:asof-tolerance]

# --8<-- [start:cartesian-product]
tokens = pl.DataFrame({"monopoly_token": ["hat", "shoe", "boat"]})

result = players.select(pl.col("name")).join(tokens, how="cross")
print(result)
# --8<-- [end:cartesian-product]
Loading

0 comments on commit 4344d21

Please sign in to comment.