Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update Aggregate functions to take builder parameters #859

Merged
merged 35 commits into from
Sep 9, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
1825394
Add NullTreatment enum wrapper and add filter option to approx_distinct
timsaucer Sep 4, 2024
3b96b9d
Small usability on aggregate
timsaucer Sep 4, 2024
c434b4e
Adding documentation and additional unit test for approx_median
timsaucer Sep 6, 2024
1a5e138
Update approx_percentil_cont with builder parameters it uses, which i…
timsaucer Sep 6, 2024
c931c06
Update approx_percentil_cont_with_weight with builder parameters it u…
timsaucer Sep 6, 2024
2cc7c94
Update array_agg to use aggregate options
timsaucer Sep 7, 2024
52e33ac
Update builder options for avg aggregate function
timsaucer Sep 7, 2024
3701631
move bit_and bit_or to use macro to generaty python fn
timsaucer Sep 7, 2024
176092c
Update builder arguments for bitwise operators
timsaucer Sep 7, 2024
fdee791
Use macro for bool_and and bool_or
timsaucer Sep 7, 2024
4f93736
Update python wrapper for arguments appropriate to bool operators
timsaucer Sep 7, 2024
62f3d2c
Set corr to use macro for pyfunction
timsaucer Sep 7, 2024
32d8ddd
Update unit test to make it easier to debug
timsaucer Sep 7, 2024
9543626
Update corr python wrapper to expose only builder parameters used
timsaucer Sep 7, 2024
8d16a3c
Update count and count_star to use macro for exposing
timsaucer Sep 7, 2024
55ebc17
Update count and count_star with approprate aggregation options
timsaucer Sep 7, 2024
7e42e6c
Move covar_pop and covar_samp to use macro for aggregates
timsaucer Sep 7, 2024
ceb65c6
Updateing covar_pop and covar_samp with builder option
timsaucer Sep 7, 2024
b7262ba
Use macro for last_value and move first_value to be near it
timsaucer Sep 7, 2024
91e5f7d
Update first_value and last_value with the builder parameters that ar…
timsaucer Sep 7, 2024
fde7e70
Remove grouping since it is not actually implemented upstream
timsaucer Sep 7, 2024
22826d4
Move median to use macro
timsaucer Sep 7, 2024
85df127
Expose builder options for median
timsaucer Sep 7, 2024
3296e1a
Expose nth value
timsaucer Sep 7, 2024
a0e24b4
Updating linear regression functions to use filter and macro
timsaucer Sep 8, 2024
2325223
Update stddev and stddev_pop to use filter and macro
timsaucer Sep 8, 2024
6be2094
Expose string_agg
timsaucer Sep 8, 2024
6420f07
Add string_agg to python wrappers and add unit test
timsaucer Sep 8, 2024
529de88
Switch sum to use macro in rust side and expose correct options in py…
timsaucer Sep 8, 2024
e352ee3
Use macro for exposing var_pop and var_samp
timsaucer Sep 8, 2024
1857468
Add unit tests for filtering on var_pop and var_samp
timsaucer Sep 8, 2024
7148dcb
Move approximation functions to use macro when possible
timsaucer Sep 8, 2024
b55ff88
Update user documentation to explain in detail the options for aggreg…
timsaucer Sep 8, 2024
ba09df1
Update unit test to handle Python 3.10
timsaucer Sep 8, 2024
62ab0ea
Clean up commented code
timsaucer Sep 9, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Update first_value and last_value with the builder parameters that ar…
…e relevant
  • Loading branch information
timsaucer committed Sep 7, 2024
commit 91e5f7d2fd72a1e8348504b01ab7bd7e056aaf15
46 changes: 30 additions & 16 deletions python/datafusion/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1864,51 +1864,65 @@ def regr_syy(y: Expr, x: Expr, distinct: bool = False) -> Expr:


def first_value(
arg: Expr,
distinct: bool = False,
expression: Expr,
filter: Optional[Expr] = None,
order_by: Optional[list[Expr]] = None,
null_treatment: Optional[NullTreatment] = None,
null_treatment: NullTreatment = NullTreatment.RESPECT_NULLS,
) -> Expr:
"""Returns the first value in a group of values."""
"""Returns the first value in a group of values.

This aggregate function will return the first value in the partition.

If using the builder functions described in :ref:`_aggregation`, this function ignores
the option ``distinct``.

Args:
expression: Argument to take the first value of
filter: If provided, only compute against rows for which the filter is true
order_by: Set the ordering of the expression to evaluate
null_treatment: Assign whether to respect or ignore null values.
"""
order_by_raw = expr_list_to_raw_expr_list(order_by)
filter_raw = filter.expr if filter is not None else None
null_treatment_raw = null_treatment.value if null_treatment is not None else None

return Expr(
f.first_value(
arg.expr,
distinct=distinct,
expression.expr,
filter=filter_raw,
order_by=order_by_raw,
null_treatment=null_treatment_raw,
null_treatment=null_treatment.value,
)
)


def last_value(
arg: Expr,
distinct: bool = False,
expression: Expr,
filter: Optional[Expr] = None,
order_by: Optional[list[Expr]] = None,
null_treatment: NullTreatment = NullTreatment.RESPECT_NULLS,
) -> Expr:
"""Returns the last value in a group of values.

To set parameters on this expression, use ``.order_by()``, ``.distinct()``,
``.filter()``, or ``.null_treatment()``.
This aggregate function will return the last value in the partition.

If using the builder functions described in :ref:`_aggregation`, this function ignores
the option ``distinct``.

Args:
expression: Argument to take the last value of
filter: If provided, only compute against rows for which the filter is true
order_by: Set the ordering of the expression to evaluate
null_treatment: Assign whether to respect or ignore null values.
"""
order_by_raw = expr_list_to_raw_expr_list(order_by)
filter_raw = filter.expr if filter is not None else None
null_treatment_raw = null_treatment.value if null_treatment is not None else None

return Expr(
f.last_value(
arg.expr,
distinct=distinct,
expression.expr,
filter=filter_raw,
order_by=order_by_raw,
null_treatment=null_treatment_raw,
null_treatment=null_treatment.value,
)
)

Expand Down
83 changes: 83 additions & 0 deletions python/datafusion/tests/test_aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

from datafusion import SessionContext, column, lit
from datafusion import functions as f
from datafusion.common import NullTreatment


@pytest.fixture
Expand All @@ -41,6 +42,23 @@ def df():
return ctx.create_dataframe([[batch]])


@pytest.fixture
def df_partitioned():
    """Provide a seven-row DataFrame with a grouping column.

    Columns:
        a: the integers 0 through 6.
        b: integers with a null in each group.
        c: group label, "A" for the first four rows and "B" for the rest.
    """
    columns = {
        "a": pa.array([0, 1, 2, 3, 4, 5, 6]),
        "b": pa.array([7, None, 7, 8, 9, None, 9]),
        "c": pa.array(["A", "A", "A", "A", "B", "B", "B"]),
    }

    # Pack the columns into a single RecordBatch and wrap it in a DataFrame.
    record_batch = pa.RecordBatch.from_arrays(
        list(columns.values()),
        names=list(columns.keys()),
    )

    return SessionContext().create_dataframe([[record_batch]])


@pytest.fixture
def df_aggregate_100():
ctx = SessionContext()
Expand Down Expand Up @@ -256,3 +274,68 @@ def test_bit_and_bool_fns(df, name, expr, result):
}

assert df.collect()[0].to_pydict() == expected


@pytest.mark.parametrize(
    "name,expr,result",
    [
        # Each case: (output column alias, aggregate expression, expected
        # values for groups "A" and "B" in that order).
        ("first_value", f.first_value(column("a")), [0, 4]),
        (
            "first_value_ordered",
            f.first_value(column("a"), order_by=[column("a").sort(ascending=False)]),
            [3, 6],
        ),
        (
            "first_value_with_null",
            f.first_value(
                column("b"),
                order_by=[column("b").sort(ascending=True)],
                null_treatment=NullTreatment.RESPECT_NULLS,
            ),
            [None, None],
        ),
        (
            "first_value_ignore_null",
            f.first_value(
                column("b"),
                order_by=[column("b").sort(ascending=True)],
                null_treatment=NullTreatment.IGNORE_NULLS,
            ),
            [7, 9],
        ),
        ("last_value", f.last_value(column("a")), [3, 6]),
        (
            "last_value_ordered",
            f.last_value(column("a"), order_by=[column("a").sort(ascending=False)]),
            [0, 4],
        ),
        (
            "last_value_with_null",
            f.last_value(
                column("b"),
                order_by=[column("b").sort(ascending=True, nulls_first=False)],
                null_treatment=NullTreatment.RESPECT_NULLS,
            ),
            [None, None],
        ),
        (
            "last_value_ignore_null",
            f.last_value(
                column("b"),
                order_by=[column("b").sort(ascending=True)],
                null_treatment=NullTreatment.IGNORE_NULLS,
            ),
            [8, 9],
        ),
    ],
)
def test_first_last_value(df_partitioned, name, expr, result) -> None:
    """Check first_value/last_value per group with ordering and null treatment.

    The fixture is aggregated by column ``c`` and the aggregate under test is
    aliased to ``name``. The output is sorted by ``c`` so the group order in
    the comparison is fixed.
    """
    grouped = df_partitioned.aggregate([column("c")], [expr.alias(name)])
    ordered = grouped.sort(column("c"))
    ordered.show()

    # Only the first collected batch is compared against the expectation.
    first_batch = ordered.collect()[0]

    assert first_batch.to_pydict() == {"c": ["A", "B"], name: result}
24 changes: 0 additions & 24 deletions python/datafusion/tests/test_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -942,30 +942,6 @@ def test_regr_funcs_df(func, expected):
assert result_df[0].column(0) == expected


def test_first_last_value(df):
    """Check ungrouped first_value/last_value on columns ``a``, ``b`` and ``d``.

    The output columns are the three first_value results followed by the
    three last_value results, in that column order.
    """
    source_columns = ("a", "b", "d")
    aggregates = [f.first_value(column(col)) for col in source_columns]
    aggregates += [f.last_value(column(col)) for col in source_columns]

    # An empty group-by list aggregates over the whole frame.
    df = df.aggregate([], aggregates)

    batch = df.collect()[0]
    expected_columns = [
        pa.array(["Hello"]),
        pa.array([4]),
        pa.array([datetime(2022, 12, 31)]),
        pa.array(["!"]),
        pa.array([6]),
        pa.array([datetime(2020, 7, 2)]),
    ]
    for position, expected in enumerate(expected_columns):
        assert batch.column(position) == expected
    df.show()


def test_binary_string_functions(df):
df = df.select(
f.encode(column("a"), literal("base64")),
Expand Down
Loading