diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 7b8ee1d..ada9de8 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -41,7 +41,7 @@ jobs: working-directory: polars_business - run: venv/bin/python -m pytest tests working-directory: polars_business - - run: venv/bin/python -m mypy . + - run: venv/bin/python -m mypy polars_business/polars_business/ tests working-directory: polars_business - name: Build wheels diff --git a/README.md b/README.md index 6a0beb6..7dbc5a2 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,7 @@ Supported functions are: - `holidays` argument, for passing custom holidays - `weekend` argument, for passing custom a weekend (default is ('Sat', 'Sun')) - `plb.datetime_range`: same as above, but the output will be `Datetime` dtype. +- `Expr.bdt.sub`: subtract two `Date`s and count the number of business dates between them! See `Examples` below! @@ -74,7 +75,10 @@ Let's shift `Date` forwards by 5 days, excluding Saturday and Sunday: ```python result = df.with_columns( - date_shifted=plb.col("date").bdt.offset_by('5bd') + date_shifted=plb.col("date").bdt.offset_by( + '5bd', + weekend=('Sat', 'Sun'), + ) ) print(result) ``` @@ -91,17 +95,18 @@ shape: (3, 2) └────────────┴──────────────┘ ``` -Let's shift `Date` forwards by 5 days, excluding Saturday and Sunday and UK holidays +Let's shift `Date` forwards by 5 days, excluding Saturday, Sunday, and England holidays for 2023 and 2024: ```python import holidays -uk_holidays = holidays.country_holidays("UK", years=[2023, 2024]) +uk_holidays = holidays.country_holidays("UK", subdiv='England', years=[2023, 2024]) result = df.with_columns( date_shifted=plb.col("date").bdt.offset_by( by='5bd', + weekend=('Sat', 'Sun'), holidays=uk_holidays, ) ) @@ -114,33 +119,34 @@ shape: (3, 2) │ --- ┆ --- │ │ date ┆ date │ ╞════════════╪══════════════╡ -│ 2023-04-03 ┆ 2023-04-11 │ +│ 2023-04-03 ┆ 2023-04-12 │ │ 2023-09-01 ┆ 2023-09-08 │ │ 2024-01-04 ┆ 2024-01-11 │ 
└────────────┴──────────────┘ ``` -Let's shift `Date` forwards by 5 days, excluding only Sunday: +Count the number of business dates between two columns: ```python -result = df.with_columns( - date_shifted=plb.col("date").bdt.offset_by( - by='5bd', - weekend=['Sun'], - ) +df = pl.DataFrame( + { + "start": [date(2023, 1, 4), date(2023, 5, 1), date(2023, 9, 9)], + "end": [date(2023, 2, 8), date(2023, 5, 2), date(2023, 12, 30)], + } ) +result = df.with_columns(n_business_days=plb.col("end").bdt.sub("start")) print(result) ``` ``` -shape: (3, 2) -┌────────────┬──────────────┐ -│ date ┆ date_shifted │ -│ --- ┆ --- │ -│ date ┆ date │ -╞════════════╪══════════════╡ -│ 2023-04-03 ┆ 2023-04-08 │ -│ 2023-09-01 ┆ 2023-09-07 │ -│ 2024-01-04 ┆ 2024-01-10 │ -└────────────┴──────────────┘ +shape: (3, 3) +┌────────────┬────────────┬─────────────────┐ +│ start ┆ end ┆ n_business_days │ +│ --- ┆ --- ┆ --- │ +│ date ┆ date ┆ i32 │ +╞════════════╪════════════╪═════════════════╡ +│ 2023-01-04 ┆ 2023-02-08 ┆ 25 │ +│ 2023-05-01 ┆ 2023-05-02 ┆ 1 │ +│ 2023-09-09 ┆ 2023-12-30 ┆ 80 │ +└────────────┴────────────┴─────────────────┘ ``` Benchmarks @@ -150,6 +156,7 @@ Single-threaded performance is: - about on par with NumPy - at least an order of magnitude faster than pandas. -but note that Polars will take care of parallelisation for you. +but note that Polars will take care of parallelisation for you, and that this plugin +will fit in with Polars lazy execution. Check out https://www.kaggle.com/code/marcogorelli/polars-business for some comparisons. 
diff --git a/polars_business/Makefile b/polars_business/Makefile index 818396f..aaaad0b 100644 --- a/polars_business/Makefile +++ b/polars_business/Makefile @@ -13,6 +13,9 @@ install-release: venv unset CONDA_PREFIX && \ source venv/bin/activate && maturin develop --release -m polars_business/Cargo.toml +pre-commit: venv + cargo fmt --all --manifest-path polars_business/Cargo.toml && cargo clippy --all-features --manifest-path polars_business/Cargo.toml + clean: -@rm -r venv -@cd expression_lib && cargo clean diff --git a/polars_business/bump_version.py b/polars_business/bump_version.py index a8e77d4..ec66811 100644 --- a/polars_business/bump_version.py +++ b/polars_business/bump_version.py @@ -6,6 +6,7 @@ how = sys.argv[1] subprocess.run(["cp", "../README.md", "polars_business/README.md"]) +subprocess.run(["cp", "../LICENSE", "polars_business/LICENSE"]) with open("polars_business/pyproject.toml", "r", encoding="utf-8") as f: content = f.read() diff --git a/polars_business/polars_business/LICENSE b/polars_business/polars_business/LICENSE new file mode 100644 index 0000000..cf54ffb --- /dev/null +++ b/polars_business/polars_business/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Marco Edward Gorelli + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/polars_business/polars_business/polars_business/__init__.py b/polars_business/polars_business/polars_business/__init__.py index dbfd59d..43466ae 100644 --- a/polars_business/polars_business/polars_business/__init__.py +++ b/polars_business/polars_business/polars_business/__init__.py @@ -104,6 +104,41 @@ def offset_by( return result return result.dt.offset_by(by) + def sub( + self, + end_dates: str | pl.Expr, + *, + weekend: Sequence[str] = ("Sat", "Sun"), + holidays: Sequence[date] | None = None, + ) -> pl.Expr: + """Count business days from ``end_dates`` up to this expression's dates. + + Despite its name, ``end_dates`` is the value being subtracted (the + earlier date for a positive result): ``plb.col("end").bdt.sub("start")`` + agrees with ``np.busday_count(start, end)``. + + Raises ``NotImplementedError`` if ``weekend`` is not Saturday/Sunday or + if ``holidays`` is non-empty (neither is supported yet). + """ + # Compare as a tuple so an equivalent list (["Sat", "Sun"]) is accepted. + if tuple(weekend) != ("Sat", "Sun"): + raise NotImplementedError( + "custom weekends are not yet supported - coming soon!" + ) + if holidays: + raise NotImplementedError( + "custom holidays are not yet supported - coming soon!" 
+ ) + if isinstance(end_dates, str): + end_dates = pl.col(end_dates) + result = self._expr._register_plugin( + lib=lib, + symbol="sub", + is_elementwise=True, + args=[end_dates], + ) + return result + class BExpr(pl.Expr): @property diff --git a/polars_business/polars_business/pyproject.toml b/polars_business/polars_business/pyproject.toml index bfa89fc..54930b8 100644 --- a/polars_business/polars_business/pyproject.toml +++ b/polars_business/polars_business/pyproject.toml @@ -4,15 +4,16 @@ build-backend = "maturin" [project] name = "polars-business" -requires-python = ">=3.8" +description = "Business day utilities for Polars" +readme = "README.md" +authors = [ + { name="Marco Gorelli", email="33491632+MarcoGorelli@users.noreply.github.com" }, +] +license = { file = "LICENSE" } classifiers = [ "Programming Language :: Rust", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] version = "0.1.29" -authors = [ - { name="Marco Gorelli", email="33491632+MarcoGorelli@users.noreply.github.com" }, -] -description = "Business day utilities for Polars" -readme = "README.md" +requires-python = ">=3.8" diff --git a/polars_business/polars_business/src/expressions.rs b/polars_business/polars_business/src/expressions.rs index 9df8630..06c5a16 100644 --- a/polars_business/polars_business/src/expressions.rs +++ b/polars_business/polars_business/src/expressions.rs @@ -1,4 +1,5 @@ use crate::business_days::*; +use crate::sub::*; use polars::prelude::*; use pyo3_polars::derive::polars_expr; use serde::Deserialize; @@ -23,3 +24,10 @@ fn advance_n_days(inputs: &[Series], kwargs: BusinessDayKwargs) -> PolarsResult< impl_advance_n_days(s, n, holidays, weekend) } + +#[polars_expr(output_type=Int32)] +fn sub(inputs: &[Series]) -> PolarsResult { + let begin_dates = &inputs[0]; + let end_dates = &inputs[1]; + impl_sub(begin_dates, end_dates) +} diff --git a/polars_business/polars_business/src/lib.rs 
b/polars_business/polars_business/src/lib.rs index bc3a4e2..98da7d0 100644 --- a/polars_business/polars_business/src/lib.rs +++ b/polars_business/polars_business/src/lib.rs @@ -1,5 +1,6 @@ mod business_days; mod expressions; +mod sub; #[cfg(target_os = "linux")] use jemallocator::Jemalloc; diff --git a/polars_business/polars_business/src/sub.rs b/polars_business/polars_business/src/sub.rs new file mode 100644 index 0000000..4539d26 --- /dev/null +++ b/polars_business/polars_business/src/sub.rs @@ -0,0 +1,79 @@ +use crate::business_days::weekday; +use polars::prelude::arity::binary_elementwise; +use polars::prelude::*; + +fn date_diff(mut start_date: i32, mut end_date: i32) -> i32 { + let swapped = start_date > end_date; + if swapped { + (start_date, end_date) = (end_date, start_date); + start_date += 1; + end_date += 1; + } + + let mut start_weekday = weekday(start_date); + let end_weekday = weekday(end_date); + + if start_weekday == 6 { + start_date += 2; + start_weekday = 1; + } else if start_weekday == 7 { + start_date += 1; + start_weekday = 1; + } + if end_weekday == 6 { + end_date += 2; + } else if end_weekday == 7 { + end_date += 1; + } + + let diff = end_date - start_date; + + let whole_weeks = diff / 7; + let mut count = 0; + count += whole_weeks * 5; + start_date += whole_weeks * 7; + while start_date < end_date { + if start_weekday < 6 { + count += 1; + } + start_date += 1; + start_weekday += 1; + if start_weekday > 7 { + start_weekday = 1; + } + } + if swapped { + -count + } else { + count + } +} + +pub(crate) fn impl_sub( + end_dates: &Series, + start_dates: &Series, + // holidays: Vec, + // weekend: Vec, +) -> PolarsResult { + if (start_dates.dtype() != &DataType::Date) || (end_dates.dtype() != &DataType::Date) { + polars_bail!(InvalidOperation: "polars_business sub only works on Date type. 
Please cast to Date first."); + } + let start_dates = start_dates.date()?; + let end_dates = end_dates.date()?; + let out = match end_dates.len() { + 1 => { + if let Some(end_date) = end_dates.get(0) { + start_dates.apply(|x_date| x_date.map(|start_date| date_diff(start_date, end_date))) + } else { + Int32Chunked::full_null(start_dates.name(), start_dates.len()) + } + } + _ => binary_elementwise(start_dates, end_dates, |opt_s, opt_n| { + match (opt_s, opt_n) { + (Some(start_date), Some(end_date)) => Some(date_diff(start_date, end_date)), + _ => None, + } + }), + }; + Ok(out.into_series()) +} diff --git a/polars_business/sub_perf.py b/polars_business/sub_perf.py new file mode 100644 index 0000000..90e0767 --- /dev/null +++ b/polars_business/sub_perf.py @@ -0,0 +1,51 @@ +# type: ignore +import timeit +import warnings +import numpy as np + +BENCHMARKS = [1, 2, 3, 4] + +SIZE = 1_000_000 + +# BENCHMARK 1: NO HOLIDAYS INVOLVED + +setup = f""" +import polars as pl +import polars_business as plb +from datetime import date +import numpy as np +import pandas as pd +import holidays +import warnings + +dates = pl.date_range(date(2020, 1, 1), date(2024, 1, 1), closed='left', eager=True) +size = {SIZE} +start_dates = np.random.choice(dates, size) +end_dates = np.random.choice(dates, size) + +df = pl.DataFrame({{ + 'start_date': start_dates, + 'end_date': end_dates, +}}) +""" + + +def time_it(statement): + results = ( + np.array( + timeit.Timer( + stmt=statement, + setup=setup, + ).repeat(7, 3) + ) + / 3 + ) + return round(min(results), 5) + + +if 1 in BENCHMARKS: + print( + "Polars-business: ", + time_it("result_pl = df.select(plb.col('end_date').bdt.sub('start_date'))"), + ) + print("NumPy: ", time_it("result_np = np.busday_count(start_dates, end_dates)")) diff --git a/polars_business/t.py b/polars_business/t.py index 0799aa2..73672c7 100644 --- a/polars_business/t.py +++ b/polars_business/t.py @@ -1,7 +1,20 @@ import polars as pl +import numpy as np import polars_business as 
plb from datetime import date -df = pl.DataFrame({"ts": [date(2020, 1, 1)]}) - -print(df.with_columns(ts_shifted=plb.col("ts").bdt.offset_by('3bd'))) +df = pl.DataFrame( + { + "start": pl.date_range(date(2019, 12, 30), date(2020, 2, 8), eager=True), + "end": [date(2020, 2, 3)] * 41, + } +) +with pl.Config(tbl_rows=100): + print( + df.with_columns( + start_weekday=pl.col("start").dt.weekday(), + end_weekday=pl.col("end").dt.weekday(), + result=plb.col("end").bdt.sub("start"),  # default Sat/Sun weekend: custom weekends raise NotImplementedError + result_np=pl.Series(np.busday_count(df["start"], df["end"])), + ) + ) diff --git a/polars_business/tests/test_business_offsets.py b/polars_business/tests/test_business_offsets.py index 2160614..90aa0b2 100644 --- a/polars_business/tests/test_business_offsets.py +++ b/polars_business/tests/test_business_offsets.py @@ -17,7 +17,10 @@ def get_result( - date: dt.date, dtype: PolarsDataType, by: str | pl.Series, **kwargs: Mapping[str, Any] + date: dt.date, + dtype: PolarsDataType, + by: str | pl.Series, + **kwargs: Mapping[str, Any], ) -> dt.date: if dtype == pl.Date: result = ( @@ -53,7 +56,12 @@ def get_result( ), function=st.sampled_from([lambda x: x, lambda x: pl.Series([x])]), ) -def test_against_np_busday_offset(date: dt.date, n: int, dtype: PolarsDataType, function: Callable[[str], str | pl.Series]) -> None: +def test_against_np_busday_offset( + date: dt.date, + n: int, + dtype: PolarsDataType, + function: Callable[[str], str | pl.Series], +) -> None: # how to do this... 
# convert time zone of date assume(date.strftime("%a") not in ("Sat", "Sun")) @@ -97,7 +105,11 @@ def test_against_pandas_bday_offset(date: dt.date, n: int) -> None: function=st.sampled_from([lambda x: x, lambda x: pl.Series([x])]), ) def test_against_np_busday_offset_with_holidays( - date: dt.date, n: int, holidays: list[dt.date], dtype: PolarsDataType, function: Callable[[str], str | pl.Series] + date: dt.date, + n: int, + holidays: list[dt.date], + dtype: PolarsDataType, + function: Callable[[str], str | pl.Series], ) -> None: assume(date.strftime("%a") not in ("Sat", "Sun")) assume(date not in holidays) # TODO: remove once unwrap is removed @@ -125,7 +137,11 @@ def test_against_np_busday_offset_with_holidays( function=st.sampled_from([lambda x: x, lambda x: pl.Series([x])]), ) def test_against_np_busday_offset_with_weekends( - date: dt.date, n: int, weekend: list[dt.date], dtype: PolarsDataType, function: Callable[[str], str | pl.Series] + date: dt.date, + n: int, + weekend: list[dt.date], + dtype: PolarsDataType, + function: Callable[[str], str | pl.Series], ) -> None: assume(date.strftime("%a") not in weekend) result = get_result(date, dtype, by=function(f"{n}bd"), weekend=weekend) # type: ignore[arg-type] @@ -158,8 +174,12 @@ def test_against_np_busday_offset_with_weekends( function=st.sampled_from([lambda x: x, lambda x: pl.Series([x])]), ) def test_against_np_busday_offset_with_weekends_and_holidays( - date: dt.date, n: int, weekend: list[str], holidays: list[dt.date], dtype: PolarsDataType, function: Callable[[str], str | pl.Series] - + date: dt.date, + n: int, + weekend: list[str], + holidays: list[dt.date], + dtype: PolarsDataType, + function: Callable[[str], str | pl.Series], ) -> None: assume(date.strftime("%a") not in weekend) assume(date not in holidays) diff --git a/polars_business/tests/test_sub.py b/polars_business/tests/test_sub.py new file mode 100644 index 0000000..1b2d156 --- /dev/null +++ b/polars_business/tests/test_sub.py @@ -0,0 +1,41 @@ 
+import datetime as dt +import pytest +import pandas as pd # type: ignore +from typing import Mapping, Any, Callable + +import hypothesis.strategies as st +import numpy as np +from hypothesis import given, assume, reject + +import polars as pl +import polars_business as plb +from polars.type_aliases import PolarsDataType + + +mapping = {"Mon": 1, "Tue": 2, "Wed": 3, "Thu": 4, "Fri": 5, "Sat": 6, "Sun": 7} +reverse_mapping = {value: key for key, value in mapping.items()} + + +def get_result( + start_date: dt.date | pl.Series, end_date: dt.date, **kwargs: Mapping[str, Any] +) -> int: + return ( # type: ignore[no-any-return] + pl.DataFrame({"end_date": [end_date]}) + .select(n=plb.col("end_date").bdt.sub(start_date, **kwargs))["n"] # type: ignore[arg-type] + .item() + ) + + +@given( + start_date=st.dates(min_value=dt.date(1000, 1, 1), max_value=dt.date(9999, 12, 31)), + end_date=st.dates(min_value=dt.date(1000, 1, 1), max_value=dt.date(9999, 12, 31)), + function=st.sampled_from([lambda x: x, lambda x: pl.Series([x])]), +) +def test_against_np_busday_offset( + start_date: dt.date, + end_date: dt.date, + function: Callable[[dt.date], dt.date | pl.Series], +) -> None: + result = get_result(function(start_date), end_date) + expected = np.busday_count(start_date, end_date) + assert result == expected