docs: Add non-equi joins to, and revise, joins docs page (#19127)

pola-rs · Oct 10, 2024 · 4344d21 · 4344d21
1 parent 48bc09b
commit 4344d21
Show file tree

Hide file tree

Showing 8 changed files with 548 additions and 395 deletions.
diff --git a/crates/polars/Cargo.toml b/crates/polars/Cargo.toml
@@ -376,6 +376,8 @@ docs-selection = [
   "is_last_distinct",
   "asof_join",
   "cross_join",
+  "semi_anti_join",
+  "iejoin",
   "concat_str",
   "string_reverse",
   "string_to_integer",

diff --git a/docs/source/_build/API_REFERENCE_LINKS.yml b/docs/source/_build/API_REFERENCE_LINKS.yml
@@ -102,6 +102,7 @@ python:
     name: execute
     link: https://docs.pola.rs/api/python/stable/reference/sql/api/polars.SQLContext.execute.html
   join_asof: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join_asof.html
+  join_where: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join_where.html
   concat: https://docs.pola.rs/api/python/stable/reference/api/polars.concat.html
   pivot: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.pivot.html
   unpivot: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.unpivot.html
@@ -180,6 +181,11 @@ rust:
     link: https://docs.pola.rs/api/rust/dev/polars_lazy/frame/struct.LazyFrame.html#method.group_by_dynamic
     feature_flags: [dynamic_group_by]
   join: https://docs.pola.rs/api/rust/dev/polars/prelude/trait.DataFrameJoinOps.html#method.join
+  join-semi_anti_join_flag:
+    name: join
+    link: https://docs.pola.rs/api/rust/dev/polars/prelude/trait.DataFrameJoinOps.html#method.join
+    feature_flags: ["semi_anti_join"]
+
   vstack: https://docs.pola.rs/api/rust/dev/polars_core/frame/struct.DataFrame.html#method.vstack
   concat: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/functions/fn.concat.html
 
@@ -193,7 +199,18 @@ rust:
   pivot: https://docs.pola.rs/api/rust/dev/polars_lazy/frame/pivot/fn.pivot.html
   unpivot: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html#method.unpivot
   upsample: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html#method.upsample
-  join_asof: https://docs.pola.rs/api/rust/dev/polars/prelude/trait.AsofJoin.html#method.join_asof
+  join_asof_by:
+    name: join_asof_by
+    link: https://docs.pola.rs/api/rust/dev/polars/prelude/trait.AsofJoinBy.html#method.join_asof_by
+    feature_flags: ['asof_join']
+  join_where:
+    name: join_where
+    link: https://docs.pola.rs/api/rust/dev/polars/prelude/struct.JoinBuilder.html#method.join_where
+    feature_flags: ["iejoin"]
+  cross_join:
+    name: cross_join
+    link: https://docs.pola.rs/api/rust/dev/polars/prelude/struct.LazyFrame.html#method.cross_join
+    feature_flags: [cross_join]
   unnest: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html#method.unnest
 
   read_csv:

diff --git a/docs/source/_build/scripts/macro.py b/docs/source/_build/scripts/macro.py
@@ -1,10 +1,12 @@
 from collections import OrderedDict
 import os
-from typing import List, Optional, Set
+from typing import Any, List, Optional, Set
 import yaml
 import logging
 
 
+from mkdocs_macros.plugin import MacrosPlugin
+
 # Supported Languages and their metadata
 LANGUAGES = OrderedDict(
     python={
@@ -130,7 +132,7 @@ def code_tab(
     """
 
 
-def define_env(env):
+def define_env(env: MacrosPlugin) -> None:
     @env.macro
     def code_header(
         language: str, section: str = [], api_functions: List[str] = []
@@ -154,7 +156,11 @@ def code_header(
 
     @env.macro
     def code_block(
-        path: str, section: str = None, api_functions: List[str] = None
+        path: str,
+        section: str = None,
+        api_functions: List[str] = None,
+        python_api_functions: List[str] = None,
+        rust_api_functions: List[str] = None,
     ) -> str:
         """Dynamically generate a code block for the code located under {language}/path
 
@@ -170,8 +176,14 @@ def code_block(
         for language, info in LANGUAGES.items():
             base_path = f"{language}/{path}{info['extension']}"
             full_path = "docs/source/src/" + base_path
+            if language == "python":
+                extras = python_api_functions or []
+            else:
+                extras = rust_api_functions or []
             # Check if file exists for the language
             if os.path.exists(full_path):
-                result.append(code_tab(base_path, section, info, api_functions))
+                result.append(
+                    code_tab(base_path, section, info, api_functions + extras)
+                )
 
         return "\n".join(result)
diff --git a/docs/source/development/contributing/index.md b/docs/source/development/contributing/index.md
@@ -268,6 +268,13 @@ df = pl.read_parquet("file.parquet")
 
 The snippet is delimited by `--8<-- [start:<snippet_name>]` and `--8<-- [end:<snippet_name>]`. The snippet name must match the name given in the second argument to `code_block` above.
 
+In some cases, you may need to link to different functions for the Python and Rust APIs.
+When that is the case, use the two extra optional arguments that `code_block` accepts to pass Python-only and Rust-only links:
+
+```
+{{code_block('path', 'snippet_name', ['common_api_links'], ['python_only_links'], ['rust_only_links'])}}
+```
+
 #### Linting
 
 Before committing, install `dprint` (see above) and run `dprint fmt` from the `docs` directory to lint the markdown files.

diff --git a/docs/source/src/python/user-guide/transformations/joins.py b/docs/source/src/python/user-guide/transformations/joins.py
@@ -1,117 +1,138 @@
-# --8<-- [start:setup]
+# --8<-- [start:prep-data]
+import pathlib
+import requests
+
+
+DATA = [
+    (
+        "https://raw.githubusercontent.com/pola-rs/polars-static/refs/heads/master/data/monopoly_props_groups.csv",
+        "docs/assets/data/monopoly_props_groups.csv",
+    ),
+    (
+        "https://raw.githubusercontent.com/pola-rs/polars-static/refs/heads/master/data/monopoly_props_prices.csv",
+        "docs/assets/data/monopoly_props_prices.csv",
+    ),
+]
+
+
+for url, dest in DATA:
+    if pathlib.Path(dest).exists():
+        continue
+    with open(dest, "wb") as f:
+        f.write(requests.get(url, timeout=10).content)
+# --8<-- [end:prep-data]
+
+# --8<-- [start:props_groups]
 import polars as pl
-from datetime import datetime
-
-# --8<-- [end:setup]
-
-# --8<-- [start:innerdf]
-df_customers = pl.DataFrame(
-    {
-        "customer_id": [1, 2, 3],
-        "name": ["Alice", "Bob", "Charlie"],
-    }
-)
-print(df_customers)
-# --8<-- [end:innerdf]
-
-# --8<-- [start:innerdf2]
-df_orders = pl.DataFrame(
-    {
-        "order_id": ["a", "b", "c"],
-        "customer_id": [1, 2, 2],
-        "amount": [100, 200, 300],
-    }
-)
-print(df_orders)
-# --8<-- [end:innerdf2]
-
 
-# --8<-- [start:inner]
-df_inner_customer_join = df_customers.join(df_orders, on="customer_id", how="inner")
-print(df_inner_customer_join)
-# --8<-- [end:inner]
+props_groups = pl.read_csv("docs/assets/data/monopoly_props_groups.csv").head(5)
+print(props_groups)
+# --8<-- [end:props_groups]
 
-# --8<-- [start:left]
-df_left_join = df_customers.join(df_orders, on="customer_id", how="left")
-print(df_left_join)
-# --8<-- [end:left]
+# --8<-- [start:props_prices]
+props_prices = pl.read_csv("docs/assets/data/monopoly_props_prices.csv").head(5)
+print(props_prices)
+# --8<-- [end:props_prices]
 
-# --8<-- [start:right]
-df_right_join = df_orders.join(df_customers, on="customer_id", how="right")
-print(df_right_join)
-# --8<-- [end:right]
+# --8<-- [start:equi-join]
+result = props_groups.join(props_prices, on="property_name")
+print(result)
+# --8<-- [end:equi-join]
 
-# --8<-- [start:full]
-df_outer_join = df_customers.join(df_orders, on="customer_id", how="full")
-print(df_outer_join)
-# --8<-- [end:full]
-
-# --8<-- [start:full_coalesce]
-df_outer_coalesce_join = df_customers.join(
-    df_orders, on="customer_id", how="full", coalesce=True
+# --8<-- [start:props_groups2]
+props_groups2 = props_groups.with_columns(
+    pl.col("property_name").str.to_lowercase(),
 )
-print(df_outer_coalesce_join)
-# --8<-- [end:full_coalesce]
+print(props_groups2)
+# --8<-- [end:props_groups2]
 
-# --8<-- [start:df3]
-df_colors = pl.DataFrame(
-    {
-        "color": ["red", "blue", "green"],
-    }
+# --8<-- [start:props_prices2]
+props_prices2 = props_prices.select(
+    pl.col("property_name").alias("name"), pl.col("cost")
 )
-print(df_colors)
-# --8<-- [end:df3]
-
-# --8<-- [start:df4]
-df_sizes = pl.DataFrame(
-    {
-        "size": ["S", "M", "L"],
-    }
+print(props_prices2)
+# --8<-- [end:props_prices2]
+
+# --8<-- [start:join-key-expression]
+result = props_groups2.join(
+    props_prices2,
+    left_on="property_name",
+    right_on=pl.col("name").str.to_lowercase(),
+)
+print(result)
+# --8<-- [end:join-key-expression]
+
+# --8<-- [start:inner-join]
+result = props_groups.join(props_prices, on="property_name", how="inner")
+print(result)
+# --8<-- [end:inner-join]
+
+# --8<-- [start:left-join]
+result = props_groups.join(props_prices, on="property_name", how="left")
+print(result)
+# --8<-- [end:left-join]
+
+# --8<-- [start:right-join]
+result = props_groups.join(props_prices, on="property_name", how="right")
+print(result)
+# --8<-- [end:right-join]
+
+# --8<-- [start:left-right-join-equals]
+print(
+    result.equals(
+        props_prices.join(
+            props_groups,
+            on="property_name",
+            how="left",
+            # Reorder the columns to match the order from above.
+        ).select(pl.col("group"), pl.col("property_name"), pl.col("cost"))
+    )
 )
-print(df_sizes)
-# --8<-- [end:df4]
+# --8<-- [end:left-right-join-equals]
+
+# --8<-- [start:full-join]
+result = props_groups.join(props_prices, on="property_name", how="full")
+print(result)
+# --8<-- [end:full-join]
+
+# --8<-- [start:full-join-coalesce]
+result = props_groups.join(
+    props_prices,
+    on="property_name",
+    how="full",
+    coalesce=True,
+)
+print(result)
+# --8<-- [end:full-join-coalesce]
 
-# --8<-- [start:cross]
-df_cross_join = df_colors.join(df_sizes, how="cross")
-print(df_cross_join)
-# --8<-- [end:cross]
+# --8<-- [start:semi-join]
+result = props_groups.join(props_prices, on="property_name", how="semi")
+print(result)
+# --8<-- [end:semi-join]
 
-# --8<-- [start:df5]
-df_cars = pl.DataFrame(
-    {
-        "id": ["a", "b", "c"],
-        "make": ["ford", "toyota", "bmw"],
-    }
-)
-print(df_cars)
-# --8<-- [end:df5]
+# --8<-- [start:anti-join]
+result = props_groups.join(props_prices, on="property_name", how="anti")
+print(result)
+# --8<-- [end:anti-join]
 
-# --8<-- [start:df6]
-df_repairs = pl.DataFrame(
+# --8<-- [start:players]
+players = pl.DataFrame(
     {
-        "id": ["c", "c"],
-        "cost": [100, 200],
+        "name": ["Alice", "Bob"],
+        "cash": [78, 135],
     }
 )
-print(df_repairs)
-# --8<-- [end:df6]
-
-# --8<-- [start:inner2]
-df_inner_join = df_cars.join(df_repairs, on="id", how="inner")
-print(df_inner_join)
-# --8<-- [end:inner2]
+print(players)
+# --8<-- [end:players]
 
-# --8<-- [start:semi]
-df_semi_join = df_cars.join(df_repairs, on="id", how="semi")
-print(df_semi_join)
-# --8<-- [end:semi]
+# --8<-- [start:non-equi]
+result = players.join_where(props_prices, pl.col("cash") > pl.col("cost"))
+print(result)
+# --8<-- [end:non-equi]
 
-# --8<-- [start:anti]
-df_anti_join = df_cars.join(df_repairs, on="id", how="anti")
-print(df_anti_join)
-# --8<-- [end:anti]
+# --8<-- [start:df_trades]
+from datetime import datetime
 
-# --8<-- [start:df7]
 df_trades = pl.DataFrame(
     {
         "time": [
@@ -125,9 +146,9 @@
     }
 )
 print(df_trades)
-# --8<-- [end:df7]
+# --8<-- [end:df_trades]
 
-# --8<-- [start:df8]
+# --8<-- [start:df_quotes]
 df_quotes = pl.DataFrame(
     {
         "time": [
@@ -142,21 +163,23 @@
 )
 
 print(df_quotes)
-# --8<-- [end:df8]
-
-# --8<-- [start:asofpre]
-df_trades = df_trades.sort("time")
-df_quotes = df_quotes.sort("time")  # Set column as sorted
-# --8<-- [end:asofpre]
+# --8<-- [end:df_quotes]
 
 # --8<-- [start:asof]
 df_asof_join = df_trades.join_asof(df_quotes, on="time", by="stock")
 print(df_asof_join)
 # --8<-- [end:asof]
 
-# --8<-- [start:asof2]
+# --8<-- [start:asof-tolerance]
 df_asof_tolerance_join = df_trades.join_asof(
     df_quotes, on="time", by="stock", tolerance="1m"
 )
 print(df_asof_tolerance_join)
-# --8<-- [end:asof2]
+# --8<-- [end:asof-tolerance]
+
+# --8<-- [start:cartesian-product]
+tokens = pl.DataFrame({"monopoly_token": ["hat", "shoe", "boat"]})
+
+result = players.select(pl.col("name")).join(tokens, how="cross")
+print(result)
+# --8<-- [end:cartesian-product]