From c4f582b732691cc3d52163e9ffc021bfee0fbfa6 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 27 Oct 2023 12:23:11 +0100 Subject: [PATCH 01/15] start adding sub --- .../polars_business/__init__.py | 17 +++++++ .../polars_business/src/expressions.rs | 8 ++++ polars_business/polars_business/src/lib.rs | 1 + polars_business/polars_business/src/sub.rs | 46 +++++++++++++++++++ polars_business/t.py | 17 ++++++- 5 files changed, 87 insertions(+), 2 deletions(-) create mode 100644 polars_business/polars_business/src/sub.rs diff --git a/polars_business/polars_business/polars_business/__init__.py b/polars_business/polars_business/polars_business/__init__.py index dbfd59d..9dfd289 100644 --- a/polars_business/polars_business/polars_business/__init__.py +++ b/polars_business/polars_business/polars_business/__init__.py @@ -104,6 +104,23 @@ def offset_by( return result return result.dt.offset_by(by) + def sub( + self, + end_dates: str | pl.Expr, + # *, + # weekend: Sequence[str] = ("Sat", "Sun"), + # holidays: Sequence[date] | None = None, + ) -> pl.Expr: + if isinstance(end_dates, str): + end_dates = pl.col(end_dates) + result = self._expr._register_plugin( + lib=lib, + symbol="sub", + is_elementwise=True, + args=[end_dates], + ) + return result + class BExpr(pl.Expr): @property diff --git a/polars_business/polars_business/src/expressions.rs b/polars_business/polars_business/src/expressions.rs index 9df8630..06c5a16 100644 --- a/polars_business/polars_business/src/expressions.rs +++ b/polars_business/polars_business/src/expressions.rs @@ -1,4 +1,5 @@ use crate::business_days::*; +use crate::sub::*; use polars::prelude::*; use pyo3_polars::derive::polars_expr; use serde::Deserialize; @@ -23,3 +24,10 @@ fn advance_n_days(inputs: &[Series], kwargs: BusinessDayKwargs) -> PolarsResult< impl_advance_n_days(s, n, holidays, weekend) } + +#[polars_expr(output_type=Int32)] +fn sub(inputs: &[Series]) -> PolarsResult { + let 
begin_dates = &inputs[0]; + let end_dates = &inputs[1]; + impl_sub(begin_dates, end_dates) +} diff --git a/polars_business/polars_business/src/lib.rs b/polars_business/polars_business/src/lib.rs index bc3a4e2..98da7d0 100644 --- a/polars_business/polars_business/src/lib.rs +++ b/polars_business/polars_business/src/lib.rs @@ -1,5 +1,6 @@ mod business_days; mod expressions; +mod sub; #[cfg(target_os = "linux")] use jemallocator::Jemalloc; diff --git a/polars_business/polars_business/src/sub.rs b/polars_business/polars_business/src/sub.rs new file mode 100644 index 0000000..9811cd7 --- /dev/null +++ b/polars_business/polars_business/src/sub.rs @@ -0,0 +1,46 @@ +use ahash::AHashMap; +use chrono::NaiveDateTime; +use polars::prelude::arity::binary_elementwise; +use polars::prelude::*; +use crate::business_days::weekday; + +pub(crate) fn impl_sub( + end_dates: &Series, + start_dates: &Series, + // holidays: Vec, + // weekend: Vec, +) -> PolarsResult { + // todo: raise if either is not Date? + let start_dates = start_dates.date()?; + let end_dates = end_dates.date()?; + let out = match end_dates.len() { + 1 => { + if let Some(end_date) = end_dates.get(0) { + start_dates.apply(|x_date| { + // want to do: + // result=floor(row_number/6) + // result_np=min(floor(row_number/6),5)×6result_np=min(floor(row_number/6),5)×6 + x_date.map(|x_date| { + end_date - ((x_date - 1)/5)*2 + }) + }) + } else { + Int32Chunked::full_null(start_dates.name(), start_dates.len()) + } + } + _ => binary_elementwise(start_dates, &end_dates, |opt_s, opt_n| match (opt_s, opt_n) { + (Some(start_date), Some(end_date)) => { + let end_weekday = weekday(end_date); + let end_date = if end_weekday == 7 { + end_date - 1 + } else { + end_date + }; + let result = end_date - start_date; + Some(result - ((result)/7)*2) + } + _ => None, + }), + }; + Ok(out.into_series()) +} \ No newline at end of file diff --git a/polars_business/t.py b/polars_business/t.py index 0799aa2..8b7f71d 100644 --- a/polars_business/t.py 
+++ b/polars_business/t.py @@ -1,7 +1,20 @@ import polars as pl +import numpy as np import polars_business as plb from datetime import date -df = pl.DataFrame({"ts": [date(2020, 1, 1)]}) +# ok, let's just do it as expression for now +# get something working, can always change the api later if necessary +# but really, nobody don't give no shit -print(df.with_columns(ts_shifted=plb.col("ts").bdt.offset_by('3bd'))) +df = pl.DataFrame({ + "start": [date(2019, 12, 30)]*41, + "end": pl.date_range(date(2019, 12, 30), date(2020, 2, 8), eager=True), +}) +with pl.Config(tbl_rows=100): + print(df.with_columns( + start_weekday=pl.col('start').dt.weekday(), + end_weekday=pl.col('end').dt.weekday(), + result=plb.col('end').bdt.sub('start'), + result_np = pl.Series(np.busday_count(df['start'], df['end'])) + )) From b4919ec14484cec662459e2ec06542f1849698a9 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 27 Oct 2023 12:24:32 +0100 Subject: [PATCH 02/15] start adding sub --- polars_business/t.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/polars_business/t.py b/polars_business/t.py index 8b7f71d..5bc2ed0 100644 --- a/polars_business/t.py +++ b/polars_business/t.py @@ -8,8 +8,8 @@ # but really, nobody don't give no shit df = pl.DataFrame({ - "start": [date(2019, 12, 30)]*41, - "end": pl.date_range(date(2019, 12, 30), date(2020, 2, 8), eager=True), + "start": pl.date_range(date(2019, 12, 30), date(2020, 2, 8), eager=True), + "end": [date(2020, 2, 1)]*42, }) with pl.Config(tbl_rows=100): print(df.with_columns( From 79eab842ce59f51b0b6eea55af3427529541b7e4 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 27 Oct 2023 12:34:50 +0100 Subject: [PATCH 03/15] wip (not there yet) --- polars_business/polars_business/src/sub.rs | 23 ++++++++++++++++------ polars_business/t.py | 2 +- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git 
a/polars_business/polars_business/src/sub.rs b/polars_business/polars_business/src/sub.rs index 9811cd7..78f7434 100644 --- a/polars_business/polars_business/src/sub.rs +++ b/polars_business/polars_business/src/sub.rs @@ -29,15 +29,26 @@ pub(crate) fn impl_sub( } } _ => binary_elementwise(start_dates, &end_dates, |opt_s, opt_n| match (opt_s, opt_n) { - (Some(start_date), Some(end_date)) => { + (Some(mut start_date), Some(mut end_date)) => { + let swapped = start_date > end_date; + if swapped { + (start_date, end_date) = (end_date, start_date); + } + let start_weekday = weekday(start_date); let end_weekday = weekday(end_date); - let end_date = if end_weekday == 7 { - end_date - 1 - } else { - end_date + if end_weekday == 7 { + end_date -= 1 + }; + if start_weekday == 7 { + start_date += 1 }; let result = end_date - start_date; - Some(result - ((result)/7)*2) + let result = result - (result/7)*2; + if swapped { + Some(-result) + } else { + Some(result) + } } _ => None, }), diff --git a/polars_business/t.py b/polars_business/t.py index 5bc2ed0..8649170 100644 --- a/polars_business/t.py +++ b/polars_business/t.py @@ -9,7 +9,7 @@ df = pl.DataFrame({ "start": pl.date_range(date(2019, 12, 30), date(2020, 2, 8), eager=True), - "end": [date(2020, 2, 1)]*42, + "end": [date(2020, 2, 1)]*41, }) with pl.Config(tbl_rows=100): print(df.with_columns( From dad91586f7e7085a1aefb66fec62add24f115c5f Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 27 Oct 2023 14:32:51 +0100 Subject: [PATCH 04/15] getting there --- polars_business/polars_business/src/sub.rs | 60 +++++++++++++++++----- polars_business/t.py | 2 +- 2 files changed, 49 insertions(+), 13 deletions(-) diff --git a/polars_business/polars_business/src/sub.rs b/polars_business/polars_business/src/sub.rs index 78f7434..c0837d4 100644 --- a/polars_business/polars_business/src/sub.rs +++ b/polars_business/polars_business/src/sub.rs @@ -30,24 +30,60 @@ pub(crate) fn impl_sub( } 
_ => binary_elementwise(start_dates, &end_dates, |opt_s, opt_n| match (opt_s, opt_n) { (Some(mut start_date), Some(mut end_date)) => { + println!("***"); let swapped = start_date > end_date; if swapped { (start_date, end_date) = (end_date, start_date); + start_date += 1; + end_date += 1; + } + + println!("start: {:?}", start_date); + println!("end: {:?}", end_date); + + let mut start_weekday = weekday(start_date); + let mut end_weekday = weekday(end_date); + println!("start weekday: {:?}", start_weekday); + println!("end weekday: {:?}", end_weekday); + + if start_weekday == 6 { + start_date += 2; + start_weekday = 1; + } else if start_weekday == 7 { + start_date += 1; + start_weekday = 1; + } + if end_weekday == 6 { + end_date += 2; + end_weekday = 1; + } else if end_weekday == 7 { + end_date += 1; + end_weekday = 1; + } + + println!("start: {:?}", start_date); + println!("end: {:?}", end_date); + + let diff = end_date - start_date; + + let whole_weeks = diff / 7; + let mut count = 0; + count += whole_weeks * 5; + start_date += whole_weeks * 7; + while start_date < end_date { + if start_weekday < 6 { + count += 1; + } + start_date += 1; + start_weekday += 1; + if start_weekday > 7 { + start_weekday = 1; + } } - let start_weekday = weekday(start_date); - let end_weekday = weekday(end_date); - if end_weekday == 7 { - end_date -= 1 - }; - if start_weekday == 7 { - start_date += 1 - }; - let result = end_date - start_date; - let result = result - (result/7)*2; if swapped { - Some(-result) + Some(-count) } else { - Some(result) + Some(count) } } _ => None, diff --git a/polars_business/t.py b/polars_business/t.py index 8649170..2cc9cc7 100644 --- a/polars_business/t.py +++ b/polars_business/t.py @@ -9,7 +9,7 @@ df = pl.DataFrame({ "start": pl.date_range(date(2019, 12, 30), date(2020, 2, 8), eager=True), - "end": [date(2020, 2, 1)]*41, + "end": [date(2020, 2, 3)]*41, }) with pl.Config(tbl_rows=100): print(df.with_columns( From 3cfc21d257beb8031405db6e1b372c4b77df60f5 Mon 
Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 27 Oct 2023 14:47:04 +0100 Subject: [PATCH 05/15] got it working! --- .../polars_business/__init__.py | 10 +- polars_business/polars_business/src/sub.rs | 118 ++++++++---------- polars_business/sub_perf.py | 51 ++++++++ polars_business/tests/test_sub.py | 37 ++++++ 4 files changed, 149 insertions(+), 67 deletions(-) create mode 100644 polars_business/sub_perf.py create mode 100644 polars_business/tests/test_sub.py diff --git a/polars_business/polars_business/polars_business/__init__.py b/polars_business/polars_business/polars_business/__init__.py index 9dfd289..6652819 100644 --- a/polars_business/polars_business/polars_business/__init__.py +++ b/polars_business/polars_business/polars_business/__init__.py @@ -107,10 +107,14 @@ def offset_by( def sub( self, end_dates: str | pl.Expr, - # *, - # weekend: Sequence[str] = ("Sat", "Sun"), - # holidays: Sequence[date] | None = None, + *, + weekend: Sequence[str] = ("Sat", "Sun"), + holidays: Sequence[date] | None = None, ) -> pl.Expr: + if weekend != ('Sat', 'Sun'): + raise NotImplementedError("customer weekends are not yet supported - coming soon!") + if holidays: + raise NotImplementedError("customer holidays are not yet supported - coming soon!") if isinstance(end_dates, str): end_dates = pl.col(end_dates) result = self._expr._register_plugin( diff --git a/polars_business/polars_business/src/sub.rs b/polars_business/polars_business/src/sub.rs index c0837d4..ce297be 100644 --- a/polars_business/polars_business/src/sub.rs +++ b/polars_business/polars_business/src/sub.rs @@ -1,27 +1,71 @@ -use ahash::AHashMap; -use chrono::NaiveDateTime; use polars::prelude::arity::binary_elementwise; use polars::prelude::*; use crate::business_days::weekday; +fn date_diff(mut start_date: i32, mut end_date: i32) -> i32 { + let swapped = start_date > end_date; + if swapped { + (start_date, end_date) = (end_date, start_date); + start_date += 
1; + end_date += 1; + } + + let mut start_weekday = weekday(start_date); + let end_weekday = weekday(end_date); + + if start_weekday == 6 { + start_date += 2; + start_weekday = 1; + } else if start_weekday == 7 { + start_date += 1; + start_weekday = 1; + } + if end_weekday == 6 { + end_date += 2; + } else if end_weekday == 7 { + end_date += 1; + } + + let diff = end_date - start_date; + + let whole_weeks = diff / 7; + let mut count = 0; + count += whole_weeks * 5; + start_date += whole_weeks * 7; + while start_date < end_date { + if start_weekday < 6 { + count += 1; + } + start_date += 1; + start_weekday += 1; + if start_weekday > 7 { + start_weekday = 1; + } + } + if swapped { + -count + } else { + count + } +} + pub(crate) fn impl_sub( end_dates: &Series, start_dates: &Series, // holidays: Vec, // weekend: Vec, ) -> PolarsResult { - // todo: raise if either is not Date? + if (start_dates.dtype() != &DataType::Date) || (end_dates.dtype() != &DataType::Date) { + polars_bail!(InvalidOperation: "polars_business sub only works on Date type. 
Please cast to Date first."); + } let start_dates = start_dates.date()?; let end_dates = end_dates.date()?; let out = match end_dates.len() { 1 => { if let Some(end_date) = end_dates.get(0) { start_dates.apply(|x_date| { - // want to do: - // result=floor(row_number/6) - // result_np=min(floor(row_number/6),5)×6result_np=min(floor(row_number/6),5)×6 - x_date.map(|x_date| { - end_date - ((x_date - 1)/5)*2 + x_date.map(|start_date| { + date_diff(start_date, end_date) }) }) } else { @@ -29,62 +73,8 @@ pub(crate) fn impl_sub( } } _ => binary_elementwise(start_dates, &end_dates, |opt_s, opt_n| match (opt_s, opt_n) { - (Some(mut start_date), Some(mut end_date)) => { - println!("***"); - let swapped = start_date > end_date; - if swapped { - (start_date, end_date) = (end_date, start_date); - start_date += 1; - end_date += 1; - } - - println!("start: {:?}", start_date); - println!("end: {:?}", end_date); - - let mut start_weekday = weekday(start_date); - let mut end_weekday = weekday(end_date); - println!("start weekday: {:?}", start_weekday); - println!("end weekday: {:?}", end_weekday); - - if start_weekday == 6 { - start_date += 2; - start_weekday = 1; - } else if start_weekday == 7 { - start_date += 1; - start_weekday = 1; - } - if end_weekday == 6 { - end_date += 2; - end_weekday = 1; - } else if end_weekday == 7 { - end_date += 1; - end_weekday = 1; - } - - println!("start: {:?}", start_date); - println!("end: {:?}", end_date); - - let diff = end_date - start_date; - - let whole_weeks = diff / 7; - let mut count = 0; - count += whole_weeks * 5; - start_date += whole_weeks * 7; - while start_date < end_date { - if start_weekday < 6 { - count += 1; - } - start_date += 1; - start_weekday += 1; - if start_weekday > 7 { - start_weekday = 1; - } - } - if swapped { - Some(-count) - } else { - Some(count) - } + (Some(start_date), Some(end_date)) => { + Some(date_diff(start_date, end_date)) } _ => None, }), diff --git a/polars_business/sub_perf.py b/polars_business/sub_perf.py 
new file mode 100644 index 0000000..90e0767 --- /dev/null +++ b/polars_business/sub_perf.py @@ -0,0 +1,51 @@ +# type: ignore +import timeit +import warnings +import numpy as np + +BENCHMARKS = [1, 2, 3, 4] + +SIZE = 1_000_000 + +# BENCHMARK 1: NO HOLIDAYS INVOLVED + +setup = f""" +import polars as pl +import polars_business as plb +from datetime import date +import numpy as np +import pandas as pd +import holidays +import warnings + +dates = pl.date_range(date(2020, 1, 1), date(2024, 1, 1), closed='left', eager=True) +size = {SIZE} +start_dates = np.random.choice(dates, size) +end_dates = np.random.choice(dates, size) + +df = pl.DataFrame({{ + 'start_date': start_dates, + 'end_date': end_dates, +}}) +""" + + +def time_it(statement): + results = ( + np.array( + timeit.Timer( + stmt=statement, + setup=setup, + ).repeat(7, 3) + ) + / 3 + ) + return round(min(results), 5) + + +if 1 in BENCHMARKS: + print( + "Polars-business: ", + time_it("result_pl = df.select(plb.col('end_date').bdt.sub('start_date'))"), + ) + print("NumPy: ", time_it("result_np = np.busday_count(start_dates, end_dates)")) diff --git a/polars_business/tests/test_sub.py b/polars_business/tests/test_sub.py new file mode 100644 index 0000000..d467572 --- /dev/null +++ b/polars_business/tests/test_sub.py @@ -0,0 +1,37 @@ +import datetime as dt +import pytest +import pandas as pd # type: ignore +from typing import Mapping, Any, Callable + +import hypothesis.strategies as st +import numpy as np +from hypothesis import given, assume, reject + +import polars as pl +import polars_business as plb +from polars.type_aliases import PolarsDataType + + +mapping = {"Mon": 1, "Tue": 2, "Wed": 3, "Thu": 4, "Fri": 5, "Sat": 6, "Sun": 7} +reverse_mapping = {value: key for key, value in mapping.items()} + + +def get_result( + start_date: dt.date | pl.Series, end_date: dt.date, **kwargs: Mapping[str, Any] +) -> int: + return ( + pl.DataFrame({"end_date": [end_date]}) + .select(n=plb.col("end_date").bdt.sub(start_date, 
**kwargs))["n"] # type: ignore[arg-type] + .item() + ) + + +@given( + start_date=st.dates(min_value=dt.date(1000, 1, 1), max_value=dt.date(9999, 12, 31)), + end_date=st.dates(min_value=dt.date(1000, 1, 1), max_value=dt.date(9999, 12, 31)), + function=st.sampled_from([lambda x: x, lambda x: pl.Series([x])]), +) +def test_against_np_busday_offset(start_date: dt.date, end_date: dt.date, function: Callable[[dt.date], dt.date| pl.Series]) -> None: + result = get_result(function(start_date), end_date) + expected = np.busday_count(start_date, end_date) + assert result == expected From 9940444754e3a34aaa6903ef60cee28be5e23424 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 27 Oct 2023 14:54:07 +0100 Subject: [PATCH 06/15] add readme example --- README.md | 49 ++++++++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 6a0beb6..e767235 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,7 @@ Supported functions are: - `holidays` argument, for passing custom holidays - `weekend` argument, for passing custom a weekend (default is ('Sat', 'Sun')) - `plb.datetime_range`: same as above, but the output will be `Datetime` dtype. +- `plb.sub`: subtract two `Date`s and count the number of business dates between them! See `Examples` below! 
@@ -74,7 +75,10 @@ Let's shift `Date` forwards by 5 days, excluding Saturday and Sunday: ```python result = df.with_columns( - date_shifted=plb.col("date").bdt.offset_by('5bd') + date_shifted=plb.col("date").bdt.offset_by( + '5bd', + weekend=('Sat', 'Sun'), + ) ) print(result) ``` @@ -91,17 +95,18 @@ shape: (3, 2) └────────────┴──────────────┘ ``` -Let's shift `Date` forwards by 5 days, excluding Saturday and Sunday and UK holidays +Let's shift `Date` forwards by 5 days, excluding Saturday, Sunday, and England holidays for 2023 and 2024: ```python import holidays -uk_holidays = holidays.country_holidays("UK", years=[2023, 2024]) +uk_holidays = holidays.country_holidays("UK", subdiv='England', years=[2023, 2024]) result = df.with_columns( date_shifted=plb.col("date").bdt.offset_by( by='5bd', + weekend=('Sat', 'Sun'), holidays=uk_holidays, ) ) @@ -114,33 +119,34 @@ shape: (3, 2) │ --- ┆ --- │ │ date ┆ date │ ╞════════════╪══════════════╡ -│ 2023-04-03 ┆ 2023-04-11 │ +│ 2023-04-03 ┆ 2023-04-12 │ │ 2023-09-01 ┆ 2023-09-08 │ │ 2024-01-04 ┆ 2024-01-11 │ └────────────┴──────────────┘ ``` -Let's shift `Date` forwards by 5 days, excluding only Sunday: +Count the number of business dates between two columns: ```python -result = df.with_columns( - date_shifted=plb.col("date").bdt.offset_by( - by='5bd', - weekend=['Sun'], - ) +df = pl.DataFrame( + { + "start": [date(2023, 1, 4), date(2023, 5, 1), date(2023, 9, 9)], + "end": [date(2023, 2, 8), date(2023, 5, 2), date(2023, 12, 30)], + } ) +result = df.with_columns(n_business_days=plb.col("end").bdt.sub("start")) print(result) ``` ``` -shape: (3, 2) -┌────────────┬──────────────┐ -│ date ┆ date_shifted │ -│ --- ┆ --- │ -│ date ┆ date │ -╞════════════╪══════════════╡ -│ 2023-04-03 ┆ 2023-04-08 │ -│ 2023-09-01 ┆ 2023-09-07 │ -│ 2024-01-04 ┆ 2024-01-10 │ -└────────────┴──────────────┘ +shape: (3, 3) +┌────────────┬────────────┬─────────────────┐ +│ start ┆ end ┆ n_business_days │ +│ --- ┆ --- ┆ --- │ +│ date ┆ date ┆ i32 │
+╞════════════╪════════════╪═════════════════╡ +│ 2023-01-04 ┆ 2023-02-08 ┆ 25 │ +│ 2023-05-01 ┆ 2023-05-02 ┆ 1 │ +│ 2023-09-09 ┆ 2023-12-30 ┆ 80 │ +└────────────┴────────────┴─────────────────┘ ``` Benchmarks @@ -150,6 +156,7 @@ Single-threaded performance is: - about on par with NumPy - at least an order of magnitude faster than pandas. -but note that Polars will take care of parallelisation for you. +but note that Polars will take care of parallelisation for you, and that this plugin +will fit in with Polars lazy execution. Check out https://www.kaggle.com/code/marcogorelli/polars-business for some comparisons. From 2db34c727d09dfad664e4b341ac95aadaf617572 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 27 Oct 2023 15:01:22 +0100 Subject: [PATCH 07/15] update build --- polars_business/bump_version.py | 1 + polars_business/polars_business/pyproject.toml | 13 +++++++------ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/polars_business/bump_version.py b/polars_business/bump_version.py index a8e77d4..ec66811 100644 --- a/polars_business/bump_version.py +++ b/polars_business/bump_version.py @@ -6,6 +6,7 @@ how = sys.argv[1] subprocess.run(["cp", "../README.md", "polars_business/README.md"]) +subprocess.run(["cp", "../LICENSE", "polars_business/LICENSE"]) with open("polars_business/pyproject.toml", "r", encoding="utf-8") as f: content = f.read() diff --git a/polars_business/polars_business/pyproject.toml b/polars_business/polars_business/pyproject.toml index bfa89fc..54930b8 100644 --- a/polars_business/polars_business/pyproject.toml +++ b/polars_business/polars_business/pyproject.toml @@ -4,15 +4,16 @@ build-backend = "maturin" [project] name = "polars-business" -requires-python = ">=3.8" +description = "Business day utilities for Polars" +readme = "README.md" +authors = [ + { name="Marco Gorelli", email="33491632+MarcoGorelli@users.noreply.github.com" }, +] +license = { file = "LICENSE" } 
classifiers = [ "Programming Language :: Rust", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] version = "0.1.29" -authors = [ - { name="Marco Gorelli", email="33491632+MarcoGorelli@users.noreply.github.com" }, -] -description = "Business day utilities for Polars" -readme = "README.md" +requires-python = ">=3.8" From 4adcf6ae04440f7c9d28a6c99ebe84920e74f5ee Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 27 Oct 2023 15:02:49 +0100 Subject: [PATCH 08/15] typo --- polars_business/polars_business/polars_business/__init__.py | 4 ++-- polars_business/t.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/polars_business/polars_business/polars_business/__init__.py b/polars_business/polars_business/polars_business/__init__.py index 6652819..bb0bfd6 100644 --- a/polars_business/polars_business/polars_business/__init__.py +++ b/polars_business/polars_business/polars_business/__init__.py @@ -112,9 +112,9 @@ def sub( holidays: Sequence[date] | None = None, ) -> pl.Expr: if weekend != ('Sat', 'Sun'): - raise NotImplementedError("customer weekends are not yet supported - coming soon!") + raise NotImplementedError("custom weekends are not yet supported - coming soon!") if holidays: - raise NotImplementedError("customer holidays are not yet supported - coming soon!") + raise NotImplementedError("custom holidays are not yet supported - coming soon!") if isinstance(end_dates, str): end_dates = pl.col(end_dates) result = self._expr._register_plugin( diff --git a/polars_business/t.py b/polars_business/t.py index 2cc9cc7..fb58275 100644 --- a/polars_business/t.py +++ b/polars_business/t.py @@ -15,6 +15,6 @@ print(df.with_columns( start_weekday=pl.col('start').dt.weekday(), end_weekday=pl.col('end').dt.weekday(), - result=plb.col('end').bdt.sub('start'), + result=plb.col('end').bdt.sub('start', weekend=('Fri',)), result_np = 
pl.Series(np.busday_count(df['start'], df['end'])) )) From 5aa8654adab1b010bcd41769f9f2f9273fa904f2 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 27 Oct 2023 15:05:20 +0100 Subject: [PATCH 09/15] typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e767235..7dbc5a2 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ Supported functions are: - `holidays` argument, for passing custom holidays - `weekend` argument, for passing custom a weekend (default is ('Sat', 'Sun')) - `plb.datetime_range`: same as above, but the output will be `Datetime` dtype. -- `plb.sub`: subtract two `Date`s and count the number of business dates between them! +- `Expr.bdt.sub`: subtract two `Date`s and count the number of business dates between them! See `Examples` below! From 9b930ec1d76dca3e5a4f69505da0845fec86aa4e Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 27 Oct 2023 15:08:58 +0100 Subject: [PATCH 10/15] wip --- polars_business/t.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/polars_business/t.py b/polars_business/t.py index fb58275..c969825 100644 --- a/polars_business/t.py +++ b/polars_business/t.py @@ -3,10 +3,6 @@ import polars_business as plb from datetime import date -# ok, let's just do it as expression for now -# get something working, can always change the api later if necessary -# but really, nobody don't give no shit - df = pl.DataFrame({ "start": pl.date_range(date(2019, 12, 30), date(2020, 2, 8), eager=True), "end": [date(2020, 2, 3)]*41, From ab970b494cf3fd8f147fdf9b43c45c94863824e8 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 27 Oct 2023 15:18:50 +0100 Subject: [PATCH 11/15] noop From 9c09ac6cac99a6dc9f5e11faa33eadd254295b70 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: 
Fri, 27 Oct 2023 15:27:24 +0100 Subject: [PATCH 12/15] noop From 3d4d5854d5fb94592340f5383209eca15513ce33 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 27 Oct 2023 15:28:55 +0100 Subject: [PATCH 13/15] add license --- polars_business/LICENSE | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 polars_business/LICENSE diff --git a/polars_business/LICENSE b/polars_business/LICENSE new file mode 100644 index 0000000..cf54ffb --- /dev/null +++ b/polars_business/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Marco Edward Gorelli + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
From 8b6eb1d4f088f490befa1b424923f291778b5338 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 27 Oct 2023 15:33:33 +0100 Subject: [PATCH 14/15] :truck: --- polars_business/{ => polars_business}/LICENSE | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename polars_business/{ => polars_business}/LICENSE (100%) diff --git a/polars_business/LICENSE b/polars_business/polars_business/LICENSE similarity index 100% rename from polars_business/LICENSE rename to polars_business/polars_business/LICENSE From 2de7a149ca99c4edc203ea2f57008764ca84ed77 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 27 Oct 2023 15:45:24 +0100 Subject: [PATCH 15/15] lints --- .github/workflows/CI.yml | 2 +- polars_business/Makefile | 3 ++ .../polars_business/__init__.py | 10 ++++-- polars_business/polars_business/src/sub.rs | 18 ++++------- polars_business/t.py | 24 ++++++++------ .../tests/test_business_offsets.py | 32 +++++++++++++++---- polars_business/tests/test_sub.py | 8 +++-- 7 files changed, 64 insertions(+), 33 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 7b8ee1d..ada9de8 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -41,7 +41,7 @@ jobs: working-directory: polars_business - run: venv/bin/python -m pytest tests working-directory: polars_business - - run: venv/bin/python -m mypy . 
+ - run: venv/bin/python -m mypy polars_business/polars_business/ tests working-directory: polars_business - name: Build wheels diff --git a/polars_business/Makefile b/polars_business/Makefile index 818396f..aaaad0b 100644 --- a/polars_business/Makefile +++ b/polars_business/Makefile @@ -13,6 +13,9 @@ install-release: venv unset CONDA_PREFIX && \ source venv/bin/activate && maturin develop --release -m polars_business/Cargo.toml +pre-commit: venv + cargo fmt --all --manifest-path polars_business/Cargo.toml && cargo clippy --all-features --manifest-path polars_business/Cargo.toml + clean: -@rm -r venv -@cd expression_lib && cargo clean diff --git a/polars_business/polars_business/polars_business/__init__.py b/polars_business/polars_business/polars_business/__init__.py index bb0bfd6..43466ae 100644 --- a/polars_business/polars_business/polars_business/__init__.py +++ b/polars_business/polars_business/polars_business/__init__.py @@ -111,10 +111,14 @@ def sub( weekend: Sequence[str] = ("Sat", "Sun"), holidays: Sequence[date] | None = None, ) -> pl.Expr: - if weekend != ('Sat', 'Sun'): - raise NotImplementedError("custom weekends are not yet supported - coming soon!") + if weekend != ("Sat", "Sun"): + raise NotImplementedError( + "custom weekends are not yet supported - coming soon!" + ) if holidays: - raise NotImplementedError("custom holidays are not yet supported - coming soon!") + raise NotImplementedError( + "custom holidays are not yet supported - coming soon!" 
+ ) if isinstance(end_dates, str): end_dates = pl.col(end_dates) result = self._expr._register_plugin( diff --git a/polars_business/polars_business/src/sub.rs b/polars_business/polars_business/src/sub.rs index ce297be..4539d26 100644 --- a/polars_business/polars_business/src/sub.rs +++ b/polars_business/polars_business/src/sub.rs @@ -1,6 +1,6 @@ +use crate::business_days::weekday; use polars::prelude::arity::binary_elementwise; use polars::prelude::*; -use crate::business_days::weekday; fn date_diff(mut start_date: i32, mut end_date: i32) -> i32 { let swapped = start_date > end_date; @@ -63,21 +63,17 @@ pub(crate) fn impl_sub( let out = match end_dates.len() { 1 => { if let Some(end_date) = end_dates.get(0) { - start_dates.apply(|x_date| { - x_date.map(|start_date| { - date_diff(start_date, end_date) - }) - }) + start_dates.apply(|x_date| x_date.map(|start_date| date_diff(start_date, end_date))) } else { Int32Chunked::full_null(start_dates.name(), start_dates.len()) } } - _ => binary_elementwise(start_dates, &end_dates, |opt_s, opt_n| match (opt_s, opt_n) { - (Some(start_date), Some(end_date)) => { - Some(date_diff(start_date, end_date)) + _ => binary_elementwise(start_dates, end_dates, |opt_s, opt_n| { + match (opt_s, opt_n) { + (Some(start_date), Some(end_date)) => Some(date_diff(start_date, end_date)), + _ => None, } - _ => None, }), }; Ok(out.into_series()) -} \ No newline at end of file +} diff --git a/polars_business/t.py b/polars_business/t.py index c969825..73672c7 100644 --- a/polars_business/t.py +++ b/polars_business/t.py @@ -3,14 +3,18 @@ import polars_business as plb from datetime import date -df = pl.DataFrame({ - "start": pl.date_range(date(2019, 12, 30), date(2020, 2, 8), eager=True), - "end": [date(2020, 2, 3)]*41, -}) +df = pl.DataFrame( + { + "start": pl.date_range(date(2019, 12, 30), date(2020, 2, 8), eager=True), + "end": [date(2020, 2, 3)] * 41, + } +) with pl.Config(tbl_rows=100): - print(df.with_columns( - 
start_weekday=pl.col('start').dt.weekday(), - end_weekday=pl.col('end').dt.weekday(), - result=plb.col('end').bdt.sub('start', weekend=('Fri',)), - result_np = pl.Series(np.busday_count(df['start'], df['end'])) - )) + print( + df.with_columns( + start_weekday=pl.col("start").dt.weekday(), + end_weekday=pl.col("end").dt.weekday(), + result=plb.col("end").bdt.sub("start", weekend=("Fri",)), + result_np=pl.Series(np.busday_count(df["start"], df["end"])), + ) + ) diff --git a/polars_business/tests/test_business_offsets.py b/polars_business/tests/test_business_offsets.py index 2160614..90aa0b2 100644 --- a/polars_business/tests/test_business_offsets.py +++ b/polars_business/tests/test_business_offsets.py @@ -17,7 +17,10 @@ def get_result( - date: dt.date, dtype: PolarsDataType, by: str | pl.Series, **kwargs: Mapping[str, Any] + date: dt.date, + dtype: PolarsDataType, + by: str | pl.Series, + **kwargs: Mapping[str, Any], ) -> dt.date: if dtype == pl.Date: result = ( @@ -53,7 +56,12 @@ def get_result( ), function=st.sampled_from([lambda x: x, lambda x: pl.Series([x])]), ) -def test_against_np_busday_offset(date: dt.date, n: int, dtype: PolarsDataType, function: Callable[[str], str | pl.Series]) -> None: +def test_against_np_busday_offset( + date: dt.date, + n: int, + dtype: PolarsDataType, + function: Callable[[str], str | pl.Series], +) -> None: # how to do this... 
# convert time zone of date assume(date.strftime("%a") not in ("Sat", "Sun")) @@ -97,7 +105,11 @@ def test_against_pandas_bday_offset(date: dt.date, n: int) -> None: function=st.sampled_from([lambda x: x, lambda x: pl.Series([x])]), ) def test_against_np_busday_offset_with_holidays( - date: dt.date, n: int, holidays: list[dt.date], dtype: PolarsDataType, function: Callable[[str], str | pl.Series] + date: dt.date, + n: int, + holidays: list[dt.date], + dtype: PolarsDataType, + function: Callable[[str], str | pl.Series], ) -> None: assume(date.strftime("%a") not in ("Sat", "Sun")) assume(date not in holidays) # TODO: remove once unwrap is removed @@ -125,7 +137,11 @@ def test_against_np_busday_offset_with_holidays( function=st.sampled_from([lambda x: x, lambda x: pl.Series([x])]), ) def test_against_np_busday_offset_with_weekends( - date: dt.date, n: int, weekend: list[dt.date], dtype: PolarsDataType, function: Callable[[str], str | pl.Series] + date: dt.date, + n: int, + weekend: list[dt.date], + dtype: PolarsDataType, + function: Callable[[str], str | pl.Series], ) -> None: assume(date.strftime("%a") not in weekend) result = get_result(date, dtype, by=function(f"{n}bd"), weekend=weekend) # type: ignore[arg-type] @@ -158,8 +174,12 @@ def test_against_np_busday_offset_with_weekends( function=st.sampled_from([lambda x: x, lambda x: pl.Series([x])]), ) def test_against_np_busday_offset_with_weekends_and_holidays( - date: dt.date, n: int, weekend: list[str], holidays: list[dt.date], dtype: PolarsDataType, function: Callable[[str], str | pl.Series] - + date: dt.date, + n: int, + weekend: list[str], + holidays: list[dt.date], + dtype: PolarsDataType, + function: Callable[[str], str | pl.Series], ) -> None: assume(date.strftime("%a") not in weekend) assume(date not in holidays) diff --git a/polars_business/tests/test_sub.py b/polars_business/tests/test_sub.py index d467572..1b2d156 100644 --- a/polars_business/tests/test_sub.py +++ b/polars_business/tests/test_sub.py @@ 
-19,7 +19,7 @@ def get_result( start_date: dt.date | pl.Series, end_date: dt.date, **kwargs: Mapping[str, Any] ) -> int: - return ( + return ( # type: ignore[no-any-return] pl.DataFrame({"end_date": [end_date]}) .select(n=plb.col("end_date").bdt.sub(start_date, **kwargs))["n"] # type: ignore[arg-type] .item() @@ -31,7 +31,11 @@ def get_result( end_date=st.dates(min_value=dt.date(1000, 1, 1), max_value=dt.date(9999, 12, 31)), function=st.sampled_from([lambda x: x, lambda x: pl.Series([x])]), ) -def test_against_np_busday_offset(start_date: dt.date, end_date: dt.date, function: Callable[[dt.date], dt.date| pl.Series]) -> None: +def test_against_np_busday_offset( + start_date: dt.date, + end_date: dt.date, + function: Callable[[dt.date], dt.date | pl.Series], +) -> None: result = get_result(function(start_date), end_date) expected = np.busday_count(start_date, end_date) assert result == expected