diff --git a/.gitignore b/.gitignore index 8632ff2..0b6899e 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ Cargo.lock target/ .hypothesis *.pyc +.vscode diff --git a/README.md b/README.md index a7eb006..7c4dc48 100644 --- a/README.md +++ b/README.md @@ -22,24 +22,37 @@ $ pip install polars-business To use it, you'll need to `import polars_business`, and then you'll be a `.business` accessor on your expressions! -Currently there's only a single function: `advance_n_days`. +Currently there's only a single function: `advance_n_days`. It takes arguments: +- `n`: number of days to advance. This can be an expression. +- `holidays`: list of holidays in `datetime.date` format. The Python `holidays` package may + be useful here. You can install it with `pip install holidays`, and then you can get a list + of holidays for a given country with (for example, `'UK'`): + ``` + import holidays + + list(holidays.country_holidays('UK', years=[2020, 2021, 2022, 2023])) + ``` Example ------- -Here's an example of how to shift a date range forwards by 5 business days (i.e. Monday to Friday, excluding weekends): +Given some dates, can you shift them all forwards by 5 business days (according to the UK holiday calendar)? + +With `polars-business`, this is easy: ```python +from datetime import date + +import holidays import polars as pl import polars_business -from datetime import date +uk_holidays = holidays.country_holidays('UK', years=[2023, 2024]) df = pl.DataFrame({ - "dates": pl.date_range(date(2000, 1, 1), date(9999, 1, 1), eager=True), + "dates": [date(2023, 4, 2), date(2023, 9, 1), date(2024, 1, 4)] }) -df = df.filter(pl.col('dates').dt.weekday() <6) -print(df.with_columns(dates_shifted=pl.col('dates').business.advance_n_days(n=5))) +print(df.with_columns(dates_shifted=pl.col('dates').business.advance_n_days(n=5, holidays=uk_holidays))) ``` Note @@ -50,13 +63,27 @@ What to expected ---------------- The following will hopefully come relatively soon: - support for `Datetime`s -- support for custom holiday calendars - support for rolling forwards/backwards to the next valid business date (if not already on one) Ideas for future development: - business date range -- support for custom mask +- support for custom week mask + +Benchmarks +---------- + +The following timings can be verified using the `perf.py` script. + +### Adding 17 business days to 10 million dates (no holidays) + +- Polars-business 0.058 +- NumPy 0.092 +- pandas 0.801 +### Adding 17 business days to 10 million dates (UK holidays for 2020-2023) -Currently there's only a single function: `advance_n_days`. +- Polars-business 0.406 +- NumPy 0.417 +- pandas: omitted as pandas doesn't (yet) vectorise `CustomBusinessDay`, so + we'd likely be talking about minutes diff --git a/src/Makefile b/src/Makefile index 949a2b8..1f88e68 100644 --- a/src/Makefile +++ b/src/Makefile @@ -2,24 +2,39 @@ SHELL=/bin/bash venv: ## Set up virtual environment - python3 -m venv venv - venv/bin/pip install -r requirements.txt + . ../.venv/bin/activate install: venv unset CONDA_PREFIX && \ - source venv/bin/activate && maturin develop -m polars_business/Cargo.toml + source ../.venv/bin/activate && maturin develop -m polars_business/Cargo.toml install-release: venv unset CONDA_PREFIX && \ - source venv/bin/activate && maturin develop --release -m polars_business/Cargo.toml + source ../.venv/bin/activate && maturin develop --release -m polars_business/Cargo.toml + +build: venv + unset CONDA_PREFIX && \ + source ../.venv/bin/activate && maturin develop --release -m polars_business/Cargo.toml clean: - -@rm -r venv + -@rm -r ../.venv -@cd polars_business && cargo clean run: install - source venv/bin/activate && python run.py + source ../.venv/bin/activate && python run.py run-release: install-release - source venv/bin/activate && python run.py + source ../.venv/bin/activate && python run.py + +test: build + pytest ../tests + +clippy: venv + cargo clippy -p polars_business + +fmt: venv + cargo fmt --all + black . + +pre-commit: clippy fmt diff --git a/src/perf.py b/src/perf.py new file mode 100644 index 0000000..15dcdf9 --- /dev/null +++ b/src/perf.py @@ -0,0 +1,75 @@ + +import timeit +import numpy as np + +# BENCHMARK 1: NO HOLIDAYS INVOLVED + +setup = """ +import polars as pl +import polars_business +from datetime import date +import numpy as np +import pandas as pd +import holidays +import warnings + +dates = pl.date_range(date(2020, 1, 1), date(2024, 1, 1), closed='left', eager=True) +dates = dates.filter(dates.dt.weekday() < 6) +size = 10_000_000 +input_dates = np.random.choice(dates, size) + +df = pl.DataFrame({ + 'ts': input_dates, +}) + +df_pd = pd.DataFrame({ + 'ts': input_dates, +}) +""" + +def time_it(statement): + results = np.array(timeit.Timer( + stmt=statement, + setup=setup, + ) + .repeat(7, 3) + )/3 + return round(min(results), 3) + +print('Polars-business: ', time_it("result_pl = df.select(pl.col('ts').business.advance_n_days(n=17))")) + +print('NumPy: ', time_it("result_np = np.busday_offset(input_dates, 17)")) + +print('pandas: ', time_it("result_pd = df_pd['ts'] + pd.tseries.offsets.BusinessDay(17)")) + +# BENCHMARK 2: WITH HOLIDAYS + +setup = """ +import polars as pl +import polars_business +from datetime import date +import numpy as np +import pandas as pd +import holidays +import warnings + +uk_holidays = list(holidays.country_holidays('UK', years=[2020, 2021, 2022, 2023])) + +dates = pl.date_range(date(2020, 1, 1), date(2024, 1, 1), closed='left', eager=True) +dates = dates.filter(~dates.is_in(uk_holidays)) +dates = dates.filter(dates.dt.weekday() < 6) +size = 10_000_000 +input_dates = np.random.choice(dates, size) + +df = pl.DataFrame({ + 'ts': input_dates, +}) + +df_pd = pd.DataFrame({ + 'ts': input_dates, +}) +""" + +print('Polars-business: ', time_it("result_pl = df.select(pl.col('ts').business.advance_n_days(n=17, holidays=uk_holidays))")) + +print('NumPy: ', time_it("result_np = np.busday_offset(input_dates, 17, holidays=uk_holidays)")) diff --git a/src/polars_business/polars_business/__init__.py b/src/polars_business/polars_business/__init__.py index 1638120..7d7b923 100644 --- a/src/polars_business/polars_business/__init__.py +++ b/src/polars_business/polars_business/__init__.py @@ -12,14 +12,22 @@ class BusinessDayTools: def __init__(self, expr: pl.Expr): self._expr = expr.cast(pl.Int32) - - def advance_n_days(self, n) -> pl.Expr: + def advance_n_days(self, n, holidays=None) -> pl.Expr: # if not (isinstance(n, int) and n > 0): # raise ValueError("only positive integers are currently supported for `n`") - - return self._expr._register_plugin( - lib=lib, - symbol="advance_n_days", - is_elementwise=True, - args = [n], - ) + if holidays is None: + return self._expr._register_plugin( + lib=lib, + symbol="advance_n_days", + is_elementwise=True, + args=[ + n, + ], + ) + else: + return self._expr._register_plugin( + lib=lib, + symbol="advance_n_days", + is_elementwise=True, + args=[n, pl.Series([list(set(holidays))]).cast(pl.List(pl.Int32))], + ) diff --git a/src/polars_business/src/expressions.rs b/src/polars_business/src/expressions.rs index bac9a8b..7403744 100644 --- a/src/polars_business/src/expressions.rs +++ b/src/polars_business/src/expressions.rs @@ -2,54 +2,143 @@ use polars::prelude::arity::try_binary_elementwise; use polars::prelude::*; use pyo3_polars::derive::polars_expr; +fn weekday(x: i32) -> i32 { + // the first modulo might return a negative number, so we add 7 and take + // the modulo again so we're sure we have something between 0 and 6 + ((x - 4) % 7 + 7) % 7 +} + +fn calculate_n_days_without_holidays(_x: i32, n: i32, x_weekday: i32) -> i32 { + if n >= 0 { + n + (n + x_weekday) / 5 * 2 + } else { + -(-n + (-n + 4 - x_weekday) / 5 * 2) + } +} + +fn reduce_vec(vec: &[i32], x: i32, n_days: i32) -> Vec { + // Each day we skip may be a holiday, and so require skipping an additional day. + // n_days*2 is an upper-bound. + if n_days > 0 { + vec.iter() + .copied() + .filter(|&t| t >= x && t <= x + n_days * 2) + .collect() + } else { + vec.iter() + .copied() + .filter(|&t| t <= x && t >= x + n_days * 2) + .collect() + } +} + +fn increment_n_days(x: i32) -> i32 { + if x > 0 { + x + 1 + } else { + x - 1 + } +} + +fn roll(n_days: i32, weekday_res: i32) -> i32 { + if n_days > 0 { + if weekday_res == 5 { + n_days + 2 + } else if weekday_res == 6 { + n_days + 1 + } else { + n_days + } + } else if weekday_res == 5 { + n_days - 1 + } else if weekday_res == 6 { + n_days - 2 + } else { + n_days + } +} + +fn calculate_n_days(x: i32, n: i32, vec: &Vec) -> PolarsResult { + let x_mod_7 = x % 7; + let x_weekday = weekday(x_mod_7); + + if x_weekday == 5 { + polars_bail!(ComputeError: "Saturday is not a business date, cannot advance. `roll` argument coming soon.") + } else if x_weekday == 6 { + polars_bail!(ComputeError: "Sunday is not a business date, cannot advance. `roll` argument coming soon.") + } + + let mut n_days = calculate_n_days_without_holidays(x, n, x_weekday); + + if !vec.is_empty() { + let myvec: Vec = reduce_vec(vec, x, n_days); + if !myvec.is_empty() { + let mut count_hols = count_holidays(x, x + n_days, &myvec); + while count_hols > 0 { + let n_days_before = n_days; + for _ in 0..count_hols { + n_days = increment_n_days(n_days); + let weekday_res = weekday(x_mod_7 + n_days); + n_days = roll(n_days, weekday_res); + } + if n_days_before > 0 { + count_hols = count_holidays(x+n_days_before+1, x + n_days, &myvec); + } else { + count_hols = count_holidays(x+n_days_before-1, x + n_days, &myvec); + } + } + } + }; + Ok(x + n_days) +} + +fn condition(x: i32, start: i32, end: i32) -> bool { + if end > start { + x >= start && x <= end + } else { + x <= start && x >= end + } +} + +fn count_holidays(start: i32, end: i32, holidays: &[i32]) -> i32 { + holidays + .iter() + .filter(|&holiday| { + condition(*holiday, start, end) + }) + .count() as i32 +} + #[polars_expr(output_type=Date)] fn advance_n_days(inputs: &[Series]) -> PolarsResult { let ca = inputs[0].i32()?; let n_series = inputs[1].cast(&DataType::Int32)?; let n = n_series.i32()?; + let vec = if inputs.len() == 3 { + let binding = inputs[2].list()?.get(0).unwrap(); + let holidays = binding.i32()?; + let mut vec: Vec<_> = Vec::from(holidays).iter().filter_map(|&x| x).collect(); + vec.sort(); + vec + } else { + Vec::new() + }; + let vec: Vec<_> = vec + .into_iter() + .filter(|x| weekday(*x) < 5) + .collect(); + let out = match n.len() { 1 => { if let Some(n) = n.get(0) { - ca.try_apply( - |x|{ - let weekday = (x - 4) % 7; - - if weekday == 5 { - polars_bail!(ComputeError: "Saturday is not a business date, cannot advance. `roll` argument coming soon.") - } else if weekday == 6 { - polars_bail!(ComputeError: "Sunday is not a business date, cannot advance. `roll` argument coming soon.") - } - - let n_days = if n >= 0 { - n + (n + weekday) / 5 * 2 - } else { - -(-n + (-n + 4-weekday) / 5 * 2) - }; - Ok(x + n_days) - } - ) + ca.try_apply(|x| calculate_n_days(x, n, &vec)) } else { Ok(Int32Chunked::full_null(ca.name(), ca.len())) } } _ => try_binary_elementwise(ca, n, |opt_s, opt_n| match (opt_s, opt_n) { - (Some(s), Some(n)) => { - let weekday = (s - 4) % 7; - - if weekday == 5 { - polars_bail!(ComputeError: "Saturday is not a business date, cannot advance. `roll` argument coming soon.") - } else if weekday == 6 { - polars_bail!(ComputeError: "Sunday is not a business date, cannot advance. `roll` argument coming soon.") - } - - let n_days = if n >= 0 { - n + (n + weekday) / 5 * 2 - } else { - -(-n + (-n + 4 - weekday) / 5 * 2) - }; - Ok(Some(s + n_days)) - } + (Some(s), Some(n)) => calculate_n_days(s, n, &vec).map(Some), _ => Ok(None), }), }; diff --git a/src/run.py b/src/run.py index e5e90da..8279eaf 100644 --- a/src/run.py +++ b/src/run.py @@ -1,22 +1,35 @@ import polars as pl from polars_business import * -from datetime import date, datetime +from datetime import date, datetime, timedelta import numpy as np -df = pl.DataFrame({ - "dates": pl.date_range(date(2000, 1, 1), date(9999, 1, 1), eager=True), -}) -df = df.filter(pl.col('dates').dt.weekday() <6) +start = date(2000, 9, 11) +n = -29 +holidays = [date(2000, 8, 1)] +df = pl.DataFrame( + { + "dates": pl.date_range(start, start+timedelta(10), eager=True), + } +) +df = df.filter((pl.col("dates").dt.weekday() < 6) & ~pl.col("dates").is_in(holidays)) +df = df.with_columns(start_wday=pl.col("dates").dt.strftime("%a")) -print(df.head().with_columns(dates_shifted=pl.col('dates').business.advance_n_days(n=-3))[:5]) -print(df.head().with_columns(dates_shifted=pl.Series(np.busday_offset(df.head()['dates'], -3)))[:5]) - -import pandas as pd -dfpd = df.to_pandas() -print((dfpd + pd.tseries.offsets.BusinessDay(15)).iloc[20:28]) - -# Let's try to "just publish" - -# only accept: -# - date -# - a single offset +print( + df.with_columns( + dates_shifted=pl.col("dates").business.advance_n_days( + n=n, + holidays=holidays + ) + ).with_columns(end_wday=pl.col("dates_shifted").dt.strftime("%a")) +) +print( + df.with_columns( + dates_shifted=pl.Series( + np.busday_offset( + df["dates"], + n, + holidays=holidays + ) + ) + ) +) diff --git a/tests/test_business_offsets.py b/tests/test_business_offsets.py index 791d5a2..40cd644 100644 --- a/tests/test_business_offsets.py +++ b/tests/test_business_offsets.py @@ -6,11 +6,11 @@ from hypothesis import given, assume import polars as pl -import polars_business +import polars_business # noqa: F401 @given( - date=st.dates(min_value=dt.date(2000, 1, 1), max_value=dt.date(9999, 12, 31)), + date=st.dates(min_value=dt.date(1000, 1, 1), max_value=dt.date(9999, 12, 31)), n=st.integers(min_value=-30, max_value=30), ) def test_against_np_busday_offset(date: dt.date, n: int) -> None: @@ -41,3 +41,18 @@ def test_bday_n_expression(date: dt.date, n: int) -> None: expected = pd.Timestamp(date) + pd.tseries.offsets.BusinessDay(n) assert pd.Timestamp(result) == expected + +@given( + date=st.dates(min_value=dt.date(2000, 1, 1), max_value=dt.date(2000, 12, 31)), + n=st.integers(min_value=-30, max_value=30), + holidays = st.lists(st.dates(min_value=dt.date(2000, 1, 1), max_value=dt.date(2000, 12, 31)), min_size=1, max_size=100) +) +def test_against_np_busday_offset_with_holidays(date: dt.date, n: int, holidays: list[dt.date]) -> None: + assume(date.weekday() < 5) + assume(date not in holidays) + result = pl.DataFrame({'ts': [date]}).select(pl.col('ts').business.advance_n_days( + n=n, + holidays=holidays + ))['ts'].item() + expected = np.busday_offset(date, n, holidays=holidays) + assert np.datetime64(result) == expected