Skip to content

Commit bf22c1d

Browse files
authored
Prepare for DF50 (apache#1231)
* ruff fmt
* Upgrade to DF50 release candidate
* Add support for passing filter and distinct in window()
* Update documentation to not use deprecated window fn
* Remove crates.io patch
* Cargo update
1 parent b7d3519 commit bf22c1d

File tree

11 files changed

+525
-471
lines changed

11 files changed

+525
-471
lines changed

Cargo.lock

Lines changed: 400 additions & 417 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -34,15 +34,15 @@ protoc = [ "datafusion-substrait/protoc" ]
3434
substrait = ["dep:datafusion-substrait"]
3535

3636
[dependencies]
37-
tokio = { version = "1.45", features = ["macros", "rt", "rt-multi-thread", "sync"] }
38-
pyo3 = { version = "0.24", features = ["extension-module", "abi3", "abi3-py39"] }
39-
pyo3-async-runtimes = { version = "0.24", features = ["tokio-runtime"]}
37+
tokio = { version = "1.47", features = ["macros", "rt", "rt-multi-thread", "sync"] }
38+
pyo3 = { version = "0.25", features = ["extension-module", "abi3", "abi3-py39"] }
39+
pyo3-async-runtimes = { version = "0.25", features = ["tokio-runtime"]}
4040
pyo3-log = "0.12.4"
41-
arrow = { version = "55.1.0", features = ["pyarrow"] }
42-
datafusion = { version = "49.0.2", features = ["avro", "unicode_expressions"] }
43-
datafusion-substrait = { version = "49.0.2", optional = true }
44-
datafusion-proto = { version = "49.0.2" }
45-
datafusion-ffi = { version = "49.0.2" }
41+
arrow = { version = "56", features = ["pyarrow"] }
42+
datafusion = { version = "50", features = ["avro", "unicode_expressions"] }
43+
datafusion-substrait = { version = "50", optional = true }
44+
datafusion-proto = { version = "50" }
45+
datafusion-ffi = { version = "50" }
4646
prost = "0.13.1" # keep in line with `datafusion-substrait`
4747
uuid = { version = "1.18", features = ["v4"] }
4848
mimalloc = { version = "0.1", optional = true, default-features = false, features = ["local_dynamic_tls"] }
@@ -54,7 +54,7 @@ log = "0.4.27"
5454

5555
[build-dependencies]
5656
prost-types = "0.13.1" # keep in line with `datafusion-substrait`
57-
pyo3-build-config = "0.24"
57+
pyo3-build-config = "0.25"
5858

5959
[lib]
6060
name = "datafusion_python"

benchmarks/max_cpu_usage.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@
5353

5454
def main(num_rows: int, partitions: int) -> None:
5555
"""Run a simple aggregation after repartitioning.
56-
56+
5757
This function demonstrates basic partitioning concepts using synthetic data.
5858
Real-world performance will depend on your specific data sources, query types,
5959
and system configuration.

docs/source/user-guide/common-operations/windows.rst

Lines changed: 24 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ We'll use the pokemon dataset (from Ritchie Vink) in the following examples.
3131
.. ipython:: python
3232
3333
from datafusion import SessionContext
34-
from datafusion import col
34+
from datafusion import col, lit
3535
from datafusion import functions as f
3636
3737
ctx = SessionContext()
@@ -120,16 +120,14 @@ two preceding rows.
120120

121121
.. ipython:: python
122122
123-
from datafusion.expr import WindowFrame
123+
from datafusion.expr import Window, WindowFrame
124124
125125
df.select(
126126
col('"Name"'),
127127
col('"Speed"'),
128-
f.window("avg",
129-
[col('"Speed"')],
130-
order_by=[col('"Speed"')],
131-
window_frame=WindowFrame("rows", 2, 0)
132-
).alias("Previous Speed")
128+
f.avg(col('"Speed"'))
129+
.over(Window(window_frame=WindowFrame("rows", 2, 0), order_by=[col('"Speed"')]))
130+
.alias("Previous Speed"),
133131
)
134132
135133
Null Treatment
@@ -151,21 +149,27 @@ it's ``Type 2`` column that are null.
151149
152150
from datafusion.common import NullTreatment
153151
154-
df.filter(col('"Type 1"') == lit("Bug")).select(
152+
df.filter(col('"Type 1"') == lit("Bug")).select(
155153
'"Name"',
156154
'"Type 2"',
157-
f.window("last_value", [col('"Type 2"')])
158-
.window_frame(WindowFrame("rows", None, 0))
159-
.order_by(col('"Speed"'))
160-
.null_treatment(NullTreatment.IGNORE_NULLS)
161-
.build()
162-
.alias("last_wo_null"),
163-
f.window("last_value", [col('"Type 2"')])
164-
.window_frame(WindowFrame("rows", None, 0))
165-
.order_by(col('"Speed"'))
166-
.null_treatment(NullTreatment.RESPECT_NULLS)
167-
.build()
168-
.alias("last_with_null")
155+
f.last_value(col('"Type 2"'))
156+
.over(
157+
Window(
158+
window_frame=WindowFrame("rows", None, 0),
159+
order_by=[col('"Speed"')],
160+
null_treatment=NullTreatment.IGNORE_NULLS,
161+
)
162+
)
163+
.alias("last_wo_null"),
164+
f.last_value(col('"Type 2"'))
165+
.over(
166+
Window(
167+
window_frame=WindowFrame("rows", None, 0),
168+
order_by=[col('"Speed"')],
169+
null_treatment=NullTreatment.RESPECT_NULLS,
170+
)
171+
)
172+
.alias("last_with_null"),
169173
)
170174
171175
Aggregate Functions

python/datafusion/functions.py

Lines changed: 39 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,14 @@
3232
WindowFrame,
3333
expr_list_to_raw_expr_list,
3434
sort_list_to_raw_sort_list,
35+
sort_or_default,
3536
)
3637

38+
try:
39+
from warnings import deprecated # Python 3.13+
40+
except ImportError:
41+
from typing_extensions import deprecated # Python 3.12
42+
3743
if TYPE_CHECKING:
3844
from datafusion.context import SessionContext
3945

@@ -426,12 +432,15 @@ def when(when: Expr, then: Expr) -> CaseBuilder:
426432
return CaseBuilder(f.when(when.expr, then.expr))
427433

428434

435+
@deprecated("Prefer to call Expr.over() instead")
429436
def window(
430437
name: str,
431438
args: list[Expr],
432439
partition_by: list[Expr] | Expr | None = None,
433440
order_by: list[SortKey] | SortKey | None = None,
434441
window_frame: WindowFrame | None = None,
442+
filter: Expr | None = None,
443+
distinct: bool = False,
435444
ctx: SessionContext | None = None,
436445
) -> Expr:
437446
"""Creates a new Window function expression.
@@ -451,7 +460,19 @@ def window(
451460
order_by_raw = sort_list_to_raw_sort_list(order_by)
452461
window_frame = window_frame.window_frame if window_frame is not None else None
453462
ctx = ctx.ctx if ctx is not None else None
454-
return Expr(f.window(name, args, partition_by_raw, order_by_raw, window_frame, ctx))
463+
filter_raw = filter.expr if filter is not None else None
464+
return Expr(
465+
f.window(
466+
name,
467+
args,
468+
partition_by=partition_by_raw,
469+
order_by=order_by_raw,
470+
window_frame=window_frame,
471+
ctx=ctx,
472+
filter=filter_raw,
473+
distinct=distinct,
474+
)
475+
)
455476

456477

457478
# scalar functions
@@ -1664,7 +1685,7 @@ def approx_median(expression: Expr, filter: Optional[Expr] = None) -> Expr:
16641685

16651686

16661687
def approx_percentile_cont(
1667-
expression: Expr,
1688+
sort_expression: Expr | SortExpr,
16681689
percentile: float,
16691690
num_centroids: Optional[int] = None,
16701691
filter: Optional[Expr] = None,
@@ -1685,21 +1706,26 @@ def approx_percentile_cont(
16851706
the options ``order_by``, ``null_treatment``, and ``distinct``.
16861707
16871708
Args:
1688-
expression: Values for which to find the approximate percentile
1709+
sort_expression: Values for which to find the approximate percentile
16891710
percentile: This must be between 0.0 and 1.0, inclusive
16901711
num_centroids: Max bin size for the t-digest algorithm
16911712
filter: If provided, only compute against rows for which the filter is True
16921713
"""
1714+
sort_expr_raw = sort_or_default(sort_expression)
16931715
filter_raw = filter.expr if filter is not None else None
16941716
return Expr(
16951717
f.approx_percentile_cont(
1696-
expression.expr, percentile, num_centroids=num_centroids, filter=filter_raw
1718+
sort_expr_raw, percentile, num_centroids=num_centroids, filter=filter_raw
16971719
)
16981720
)
16991721

17001722

17011723
def approx_percentile_cont_with_weight(
1702-
expression: Expr, weight: Expr, percentile: float, filter: Optional[Expr] = None
1724+
sort_expression: Expr | SortExpr,
1725+
weight: Expr,
1726+
percentile: float,
1727+
num_centroids: Optional[int] = None,
1728+
filter: Optional[Expr] = None,
17031729
) -> Expr:
17041730
"""Returns the value of the weighted approximate percentile.
17051731
@@ -1710,16 +1736,22 @@ def approx_percentile_cont_with_weight(
17101736
the options ``order_by``, ``null_treatment``, and ``distinct``.
17111737
17121738
Args:
1713-
expression: Values for which to find the approximate percentile
1739+
sort_expression: Values for which to find the approximate percentile
17141740
weight: Relative weight for each of the values in ``expression``
17151741
percentile: This must be between 0.0 and 1.0, inclusive
1742+
num_centroids: Max bin size for the t-digest algorithm
17161743
filter: If provided, only compute against rows for which the filter is True
17171744
17181745
"""
1746+
sort_expr_raw = sort_or_default(sort_expression)
17191747
filter_raw = filter.expr if filter is not None else None
17201748
return Expr(
17211749
f.approx_percentile_cont_with_weight(
1722-
expression.expr, weight.expr, percentile, filter=filter_raw
1750+
sort_expr_raw,
1751+
weight.expr,
1752+
percentile,
1753+
num_centroids=num_centroids,
1754+
filter=filter_raw,
17231755
)
17241756
)
17251757

python/tests/test_aggregation.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,11 +130,27 @@ def test_aggregation_stats(df, agg_expr, calc_expected):
130130
(f.median(column("b"), filter=column("a") != 2), pa.array([5]), False),
131131
(f.approx_median(column("b"), filter=column("a") != 2), pa.array([5]), False),
132132
(f.approx_percentile_cont(column("b"), 0.5), pa.array([4]), False),
133+
(
134+
f.approx_percentile_cont(
135+
column("b").sort(ascending=True, nulls_first=False),
136+
0.5,
137+
num_centroids=2,
138+
),
139+
pa.array([4]),
140+
False,
141+
),
133142
(
134143
f.approx_percentile_cont_with_weight(column("b"), lit(0.6), 0.5),
135144
pa.array([6], type=pa.float64()),
136145
False,
137146
),
147+
(
148+
f.approx_percentile_cont_with_weight(
149+
column("b").sort(ascending=False, nulls_first=False), lit(0.6), 0.5
150+
),
151+
pa.array([6], type=pa.float64()),
152+
False,
153+
),
138154
(
139155
f.approx_percentile_cont_with_weight(
140156
column("b"), lit(0.6), 0.5, filter=column("a") != lit(3)

src/common/data_type.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,16 @@ impl DataTypeMap {
215215
DataType::Dictionary(_, _) => {
216216
Err(PyNotImplementedError::new_err(format!("{arrow_type:?}")))
217217
}
218+
DataType::Decimal32(precision, scale) => Ok(DataTypeMap::new(
219+
DataType::Decimal32(*precision, *scale),
220+
PythonType::Float,
221+
SqlType::DECIMAL,
222+
)),
223+
DataType::Decimal64(precision, scale) => Ok(DataTypeMap::new(
224+
DataType::Decimal64(*precision, *scale),
225+
PythonType::Float,
226+
SqlType::DECIMAL,
227+
)),
218228
DataType::Decimal128(precision, scale) => Ok(DataTypeMap::new(
219229
DataType::Decimal128(*precision, *scale),
220230
PythonType::Float,
@@ -549,6 +559,8 @@ impl DataTypeMap {
549559
DataType::Struct(_) => "Struct",
550560
DataType::Union(_, _) => "Union",
551561
DataType::Dictionary(_, _) => "Dictionary",
562+
DataType::Decimal32(_, _) => "Decimal32",
563+
DataType::Decimal64(_, _) => "Decimal64",
552564
DataType::Decimal128(_, _) => "Decimal128",
553565
DataType::Decimal256(_, _) => "Decimal256",
554566
DataType::Map(_, _) => "Map",

src/dataframe.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -276,7 +276,6 @@ impl PyParquetColumnOptions {
276276
statistics_enabled,
277277
bloom_filter_fpp,
278278
bloom_filter_ndv,
279-
..Default::default()
280279
},
281280
}
282281
}

src/expr/sort_expr.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ use std::fmt::{self, Display, Formatter};
2323
#[pyclass(name = "SortExpr", module = "datafusion.expr", subclass)]
2424
#[derive(Clone)]
2525
pub struct PySortExpr {
26-
sort: SortExpr,
26+
pub(crate) sort: SortExpr,
2727
}
2828

2929
impl From<PySortExpr> for SortExpr {

src/functions.rs

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -319,21 +319,25 @@ fn find_window_fn(
319319
}
320320

321321
/// Creates a new Window function expression
322+
#[allow(clippy::too_many_arguments)]
322323
#[pyfunction]
323-
#[pyo3(signature = (name, args, partition_by=None, order_by=None, window_frame=None, ctx=None))]
324+
#[pyo3(signature = (name, args, partition_by=None, order_by=None, window_frame=None, filter=None, distinct=false, ctx=None))]
324325
fn window(
325326
name: &str,
326327
args: Vec<PyExpr>,
327328
partition_by: Option<Vec<PyExpr>>,
328329
order_by: Option<Vec<PySortExpr>>,
329330
window_frame: Option<PyWindowFrame>,
331+
filter: Option<PyExpr>,
332+
distinct: bool,
330333
ctx: Option<PySessionContext>,
331334
) -> PyResult<PyExpr> {
332335
let fun = find_window_fn(name, ctx)?;
333336

334337
let window_frame = window_frame
335338
.map(|w| w.into())
336339
.unwrap_or(WindowFrame::new(order_by.as_ref().map(|v| !v.is_empty())));
340+
let filter = filter.map(|f| f.expr.into());
337341

338342
Ok(PyExpr {
339343
expr: datafusion::logical_expr::Expr::WindowFunction(Box::new(WindowFunction {
@@ -351,6 +355,8 @@ fn window(
351355
.map(|x| x.into())
352356
.collect::<Vec<_>>(),
353357
window_frame,
358+
filter,
359+
distinct,
354360
null_treatment: None,
355361
},
356362
})),
@@ -649,36 +655,36 @@ aggregate_function!(approx_median);
649655
// aggregate_function!(grouping);
650656

651657
#[pyfunction]
652-
#[pyo3(signature = (expression, percentile, num_centroids=None, filter=None))]
658+
#[pyo3(signature = (sort_expression, percentile, num_centroids=None, filter=None))]
653659
pub fn approx_percentile_cont(
654-
expression: PyExpr,
660+
sort_expression: PySortExpr,
655661
percentile: f64,
656662
num_centroids: Option<i64>, // enforces optional arguments at the end, currently
657663
filter: Option<PyExpr>,
658664
) -> PyDataFusionResult<PyExpr> {
659-
let args = if let Some(num_centroids) = num_centroids {
660-
vec![expression.expr, lit(percentile), lit(num_centroids)]
661-
} else {
662-
vec![expression.expr, lit(percentile)]
663-
};
664-
let udaf = functions_aggregate::approx_percentile_cont::approx_percentile_cont_udaf();
665-
let agg_fn = udaf.call(args);
665+
let agg_fn = functions_aggregate::expr_fn::approx_percentile_cont(
666+
sort_expression.sort,
667+
lit(percentile),
668+
num_centroids.map(lit),
669+
);
666670

667671
add_builder_fns_to_aggregate(agg_fn, None, filter, None, None)
668672
}
669673

670674
#[pyfunction]
671-
#[pyo3(signature = (expression, weight, percentile, filter=None))]
675+
#[pyo3(signature = (sort_expression, weight, percentile, num_centroids=None, filter=None))]
672676
pub fn approx_percentile_cont_with_weight(
673-
expression: PyExpr,
677+
sort_expression: PySortExpr,
674678
weight: PyExpr,
675679
percentile: f64,
680+
num_centroids: Option<i64>,
676681
filter: Option<PyExpr>,
677682
) -> PyDataFusionResult<PyExpr> {
678683
let agg_fn = functions_aggregate::expr_fn::approx_percentile_cont_with_weight(
679-
expression.expr,
684+
sort_expression.sort,
680685
weight.expr,
681686
lit(percentile),
687+
num_centroids.map(lit),
682688
);
683689

684690
add_builder_fns_to_aggregate(agg_fn, None, filter, None, None)

0 commit comments

Comments (0)