Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add holidays argument to business_day_count #15580

Merged
merged 5 commits into from
Apr 12, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 29 additions & 1 deletion crates/polars-ops/src/series/ops/business.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use ahash::HashSet;
use polars_core::prelude::arity::binary_elementwise_values;
use polars_core::prelude::*;

Expand All @@ -7,14 +8,28 @@ use polars_core::prelude::*;
/// - `start`: Series holding start dates.
/// - `end`: Series holding end dates.
/// - `week_mask`: A boolean array of length 7, where `true` indicates that the day is a business day.
/// - `holidays`: dates that are holidays. Each must be provided as an `i32`, i.e. the number of
/// days since the UNIX epoch.
pub fn business_day_count(
start: &Series,
end: &Series,
week_mask: [bool; 7],
holidays: &[i32],
) -> PolarsResult<Series> {
if !week_mask.iter().any(|&x| x) {
polars_bail!(ComputeError:"`week_mask` must have at least one business day");
}

// De-dupe and sort holidays, and exclude non-business days.
ritchie46 marked this conversation as resolved.
Show resolved Hide resolved
let mut holidays: Vec<i32> = holidays
.iter()
.filter(|&x| *unsafe { week_mask.get_unchecked(weekday(*x)) })
.cloned()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be `.copied()` rather than `.cloned()`?

.collect::<HashSet<_>>()
.into_iter()
.collect();
holidays.sort_unstable();
ritchie46 marked this conversation as resolved.
Show resolved Hide resolved

let start_dates = start.date()?;
let end_dates = end.date()?;
let n_business_days_in_week_mask = week_mask.iter().filter(|&x| *x).count() as i32;
Expand All @@ -28,6 +43,7 @@ pub fn business_day_count(
end_date,
&week_mask,
n_business_days_in_week_mask,
&holidays,
)
})
} else {
Expand All @@ -42,6 +58,7 @@ pub fn business_day_count(
end_date,
&week_mask,
n_business_days_in_week_mask,
&holidays,
)
})
} else {
Expand All @@ -54,6 +71,7 @@ pub fn business_day_count(
end_date,
&week_mask,
n_business_days_in_week_mask,
&holidays,
)
}),
};
Expand All @@ -67,6 +85,7 @@ fn business_day_count_impl(
mut end_date: i32,
week_mask: &[bool; 7],
n_business_days_in_week_mask: i32,
holidays: &[i32],
) -> i32 {
let swapped = start_date > end_date;
if swapped {
Expand All @@ -75,10 +94,19 @@ fn business_day_count_impl(
end_date += 1;
}

let holidays_begin = match holidays.binary_search(&start_date) {
Ok(x) => x,
Err(x) => x,
} as i32;
let holidays_end = match holidays.binary_search(&end_date) {
ritchie46 marked this conversation as resolved.
Show resolved Hide resolved
Ok(x) => x,
Err(x) => x,
} as i32;

let mut start_weekday = weekday(start_date);
let diff = end_date - start_date;
let whole_weeks = diff / 7;
let mut count = 0;
let mut count = -(holidays_end - holidays_begin);
count += whole_weeks * n_business_days_in_week_mask;
start_date += whole_weeks * 7;
while start_date < end_date {
Expand Down
20 changes: 15 additions & 5 deletions crates/polars-plan/src/dsl/function_expr/business.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@ use crate::prelude::SeriesUdf;
#[derive(Clone, PartialEq, Debug, Eq, Hash)]
pub enum BusinessFunction {
#[cfg(feature = "business")]
BusinessDayCount { week_mask: [bool; 7] },
BusinessDayCount {
week_mask: [bool; 7],
holidays: Vec<i32>,
},
}

impl Display for BusinessFunction {
Expand All @@ -30,16 +33,23 @@ impl From<BusinessFunction> for SpecialEq<Arc<dyn SeriesUdf>> {
use BusinessFunction::*;
match func {
#[cfg(feature = "business")]
BusinessDayCount { week_mask } => {
map_as_slice!(business_day_count, week_mask)
BusinessDayCount {
week_mask,
holidays,
} => {
map_as_slice!(business_day_count, week_mask, &holidays)
},
}
}
}

#[cfg(feature = "business")]
pub(super) fn business_day_count(s: &[Series], week_mask: [bool; 7]) -> PolarsResult<Series> {
pub(super) fn business_day_count(
s: &[Series],
week_mask: [bool; 7],
holidays: &[i32],
) -> PolarsResult<Series> {
let start = &s[0];
let end = &s[1];
polars_ops::prelude::business_day_count(start, end, week_mask)
polars_ops::prelude::business_day_count(start, end, week_mask, holidays)
}
12 changes: 10 additions & 2 deletions crates/polars-plan/src/dsl/functions/business.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,20 @@
use super::*;

#[cfg(feature = "dtype-date")]
pub fn business_day_count(start: Expr, end: Expr, week_mask: [bool; 7]) -> Expr {
pub fn business_day_count(
start: Expr,
end: Expr,
week_mask: [bool; 7],
holidays: Vec<i32>,
) -> Expr {
let input = vec![start, end];

Expr::Function {
input,
function: FunctionExpr::Business(BusinessFunction::BusinessDayCount { week_mask }),
function: FunctionExpr::Business(BusinessFunction::BusinessDayCount {
week_mask,
holidays,
}),
options: FunctionOptions {
allow_rename: true,
..Default::default()
Expand Down
88 changes: 62 additions & 26 deletions py-polars/polars/functions/business.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import contextlib
from datetime import date
from typing import TYPE_CHECKING, Iterable

from polars._utils.parse_expr_input import parse_as_expression
Expand All @@ -10,8 +11,6 @@
import polars.polars as plr

if TYPE_CHECKING:
from datetime import date

from polars import Expr
from polars.type_aliases import IntoExprColumn

Expand All @@ -20,6 +19,7 @@ def business_day_count(
start: date | IntoExprColumn,
end: date | IntoExprColumn,
week_mask: Iterable[bool] = (True, True, True, True, True, False, False),
holidays: Iterable[date] = (),
) -> Expr:
"""
Count the number of business days between `start` and `end` (not including `end`).
Expand All @@ -34,6 +34,19 @@ def business_day_count(
Which days of the week to count. The default is Monday to Friday.
If you wanted to count only Monday to Thursday, you would pass
`(True, True, True, True, False, False, False)`.
holidays
Holidays to exclude from the count. The Python package
`python-holidays <https://github.com/vacanza/python-holidays>`_
may come in handy here. You can install it with ``pip install holidays``,
and then, to get all Dutch holidays for years 2020-2024:

.. code-block:: python

import holidays

my_holidays = holidays.country_holidays("NL", years=range(2020, 2025))

and pass `holidays=my_holidays` when you call `business_day_count`.

Returns
-------
Expand All @@ -49,39 +62,62 @@ def business_day_count(
... }
... )
>>> df.with_columns(
... total_day_count=(pl.col("end") - pl.col("start")).dt.total_days(),
... business_day_count=pl.business_day_count("start", "end"),
... )
shape: (2, 4)
┌────────────┬────────────┬─────────────────┬────────────────────┐
│ start ┆ end ┆ total_day_count ┆ business_day_count │
│ --- ┆ --- ┆ --- ┆ --- │
│ date ┆ date ┆ i64 ┆ i32 │
╞════════════╪════════════╪═════════════════╪════════════════════╡
│ 2020-01-01 ┆ 2020-01-02 ┆ 1 ┆ 1 │
│ 2020-01-02 ┆ 2020-01-10 ┆ 8 ┆ 6 │
└────────────┴────────────┴─────────────────┴────────────────────┘

Note how the two "count" columns differ due to the weekend (2020-01-04 - 2020-01-05)
not being counted by `business_day_count`.
shape: (2, 3)
┌────────────┬────────────┬────────────────────┐
│ start ┆ end ┆ business_day_count │
│ --- ┆ --- ┆ --- │
│ date ┆ date ┆ i32 │
╞════════════╪════════════╪════════════════════╡
│ 2020-01-01 ┆ 2020-01-02 ┆ 1 │
│ 2020-01-02 ┆ 2020-01-10 ┆ 6 │
└────────────┴────────────┴────────────────────┘

Note how the business day count is 6 (as opposed to a regular day count of 8)
due to the weekend (2020-01-04 - 2020-01-05) not being counted.

You can pass a custom weekend - for example, if you only take Sunday off:

>>> week_mask = (True, True, True, True, True, True, False)
>>> df.with_columns(
... total_day_count=(pl.col("end") - pl.col("start")).dt.total_days(),
... business_day_count=pl.business_day_count("start", "end", week_mask),
... )
shape: (2, 4)
┌────────────┬────────────┬─────────────────┬────────────────────┐
│ start ┆ end ┆ total_day_count ┆ business_day_count │
│ --- ┆ --- ┆ --- ┆ --- │
│ date ┆ date ┆ i64 ┆ i32 │
╞════════════╪════════════╪═════════════════╪════════════════════╡
│ 2020-01-01 ┆ 2020-01-02 ┆ 1 ┆ 1 │
│ 2020-01-02 ┆ 2020-01-10 ┆ 8 ┆ 7 │
└────────────┴────────────┴─────────────────┴────────────────────┘
shape: (2, 3)
┌────────────┬────────────┬────────────────────┐
│ start ┆ end ┆ business_day_count │
│ --- ┆ --- ┆ --- │
│ date ┆ date ┆ i32 │
╞════════════╪════════════╪════════════════════╡
│ 2020-01-01 ┆ 2020-01-02 ┆ 1 │
│ 2020-01-02 ┆ 2020-01-10 ┆ 7 │
└────────────┴────────────┴────────────────────┘

You can also pass a list of holidays to exclude from the count:

>>> from datetime import date
>>> holidays = [date(2020, 1, 1), date(2020, 1, 2)]
>>> df.with_columns(
... business_day_count=pl.business_day_count("start", "end", holidays=holidays)
... )
shape: (2, 3)
┌────────────┬────────────┬────────────────────┐
│ start ┆ end ┆ business_day_count │
│ --- ┆ --- ┆ --- │
│ date ┆ date ┆ i32 │
╞════════════╪════════════╪════════════════════╡
│ 2020-01-01 ┆ 2020-01-02 ┆ 0 │
│ 2020-01-02 ┆ 2020-01-10 ┆ 5 │
└────────────┴────────────┴────────────────────┘
"""
start_pyexpr = parse_as_expression(start)
end_pyexpr = parse_as_expression(end)
return wrap_expr(plr.business_day_count(start_pyexpr, end_pyexpr, week_mask))
unix_epoch = date(1970, 1, 1)
return wrap_expr(
plr.business_day_count(
start_pyexpr,
end_pyexpr,
week_mask,
[(holiday - unix_epoch).days for holiday in holidays],
)
)
9 changes: 7 additions & 2 deletions py-polars/src/functions/business.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,13 @@ use pyo3::prelude::*;
use crate::PyExpr;

#[pyfunction]
pub fn business_day_count(start: PyExpr, end: PyExpr, week_mask: [bool; 7]) -> PyExpr {
pub fn business_day_count(
start: PyExpr,
end: PyExpr,
week_mask: [bool; 7],
holidays: Vec<i32>,
) -> PyExpr {
let start = start.inner;
let end = end.inner;
dsl::business_day_count(start, end, week_mask).into()
dsl::business_day_count(start, end, week_mask, holidays).into()
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,26 @@
min_size=7,
max_size=7,
),
holidays=st.lists(
st.dates(min_value=dt.date(1969, 1, 1), max_value=dt.date(1970, 12, 31)),
min_size=0,
max_size=100,
),
)
def test_against_np_busday_count(
start: dt.date,
end: dt.date,
week_mask: tuple[bool, ...],
start: dt.date, end: dt.date, week_mask: tuple[bool, ...], holidays: list[dt.date]
) -> None:
assume(any(week_mask))
result = (
pl.DataFrame({"start": [start], "end": [end]})
.select(n=pl.business_day_count("start", "end", week_mask=week_mask))["n"]
.select(
n=pl.business_day_count(
"start", "end", week_mask=week_mask, holidays=holidays
)
)["n"]
.item()
)
expected = np.busday_count(start, end, weekmask=week_mask)
expected = np.busday_count(start, end, weekmask=week_mask, holidays=holidays)
if start > end and parse_version(np.__version__) < parse_version("1.25"):
# Bug in old versions of numpy
reject()
Expand Down
Loading