Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Holidays #2

Merged
merged 17 commits into from
Oct 13, 2023
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ Cargo.lock
target/
.hypothesis
*.pyc
.vscode
45 changes: 36 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,24 +22,37 @@ $ pip install polars-business
To use it, you'll need to `import polars_business`, and then you'll have a `.business` accessor
on your expressions!

Currently there's only a single function: `advance_n_days`.
Currently there's only a single function: `advance_n_days`. It takes arguments:
- `n`: number of days to advance. This can be an expression.
- `holidays`: list of holidays in `datetime.date` format. The Python `holidays` package may
be useful here. You can install it with `pip install holidays`, and then you can get a list
of holidays for a given country with (for example, `'UK'`):
```
import holidays

list(holidays.country_holidays('UK', years=[2020, 2021, 2022, 2023]))
```

Example
-------

Here's an example of how to shift a date range forwards by 5 business days (i.e. Monday to Friday, excluding weekends):
Given some dates, can you shift them all forwards by 5 business days (according to the UK holiday calendar)?

With `polars-business`, this is easy:
```python
from datetime import date

import holidays
import polars as pl
import polars_business

from datetime import date

uk_holidays = holidays.country_holidays('UK', years=[2023, 2024])
df = pl.DataFrame({
"dates": pl.date_range(date(2000, 1, 1), date(9999, 1, 1), eager=True),
"dates": [date(2023, 4, 2), date(2023, 9, 1), date(2024, 1, 4)]
})
df = df.filter(pl.col('dates').dt.weekday() <6)

print(df.with_columns(dates_shifted=pl.col('dates').business.advance_n_days(n=5)))
print(df.with_columns(dates_shifted=pl.col('dates').business.advance_n_days(n=5, holidays=uk_holidays)))
```

Note
Expand All @@ -50,13 +63,27 @@ What to expected
----------------
The following will hopefully come relatively soon:
- support for `Datetime`s
- support for custom holiday calendars
- support for rolling forwards/backwards to the next
valid business date (if not already on one)

Ideas for future development:
- business date range
- support for custom mask
- support for custom week mask

Benchmarks
----------

The following timings (in seconds) can be verified using the `perf.py` script.

### Adding 17 business days to 10 million dates (no holidays)

- Polars-business 0.058
- NumPy 0.092
- pandas 0.801

### Adding 17 business days to 10 million dates (UK holidays for 2020-2023)

Currently there's only a single function: `advance_n_days`.
- Polars-business 0.406
- NumPy 0.417
- pandas: omitted as pandas doesn't (yet) vectorise `CustomBusinessDay`, so
we'd likely be talking about minutes
29 changes: 22 additions & 7 deletions src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,39 @@
SHELL=/bin/bash

venv: ## Set up virtual environment
python3 -m venv venv
venv/bin/pip install -r requirements.txt
. ../.venv/bin/activate

install: venv
unset CONDA_PREFIX && \
source venv/bin/activate && maturin develop -m polars_business/Cargo.toml
source ../.venv/bin/activate && maturin develop -m polars_business/Cargo.toml

install-release: venv
unset CONDA_PREFIX && \
source venv/bin/activate && maturin develop --release -m polars_business/Cargo.toml
source ../.venv/bin/activate && maturin develop --release -m polars_business/Cargo.toml

build: venv
unset CONDA_PREFIX && \
source ../.venv/bin/activate && maturin develop --release -m polars_business/Cargo.toml

clean:
-@rm -r venv
-@rm -r ../.venv
-@cd polars_business && cargo clean


run: install
source venv/bin/activate && python run.py
source ../.venv/bin/activate && python run.py

run-release: install-release
source venv/bin/activate && python run.py
source ../.venv/bin/activate && python run.py

test: build
pytest ../tests

clippy: venv
cargo clippy -p polars_business

fmt: venv
cargo fmt --all
black .

pre-commit: clippy fmt
75 changes: 75 additions & 0 deletions src/perf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@

import timeit
import numpy as np

# BENCHMARK 1: NO HOLIDAYS INVOLVED
#
# `setup` is the setup code that `time_it` hands to `timeit.Timer`; it is
# executed before the timed statement and is not part of the measurement.
# It builds the candidate dates in [2020-01-01, 2024-01-01), keeps only those
# whose `dt.weekday()` is below 6, samples 10_000_000 of them with
# `np.random.choice`, and stores the sample in both a Polars DataFrame (`df`)
# and a pandas DataFrame (`df_pd`) under the column name 'ts'.

setup = """
import polars as pl
import polars_business
from datetime import date
import numpy as np
import pandas as pd
import holidays
import warnings

dates = pl.date_range(date(2020, 1, 1), date(2024, 1, 1), closed='left', eager=True)
dates = dates.filter(dates.dt.weekday() < 6)
size = 10_000_000
input_dates = np.random.choice(dates, size)

df = pl.DataFrame({
'ts': input_dates,
})

df_pd = pd.DataFrame({
'ts': input_dates,
})
"""

def time_it(statement):
    """Return the best per-execution time of ``statement``, rounded to 3 d.p.

    The statement is timed with ``timeit.Timer`` using the module-level
    ``setup`` string (whatever it is bound to at call time) as setup code.
    It is run as 7 repeats of 3 executions each; every repeat total is
    divided by 3 and the smallest of those averages is returned.
    """
    timer = timeit.Timer(stmt=statement, setup=setup)
    # Each entry is the total time for one batch of 3 executions.
    batch_totals = np.array(timer.repeat(7, 3))
    per_execution = batch_totals / 3
    return round(min(per_execution), 3)

# Time the same "advance by 17 business days" operation with each library,
# using the benchmark-1 `setup` defined above (no holidays).

# polars-business: plugin expression applied to the 'ts' column of `df`.
print('Polars-business: ', time_it("result_pl = df.select(pl.col('ts').business.advance_n_days(n=17))"))

# NumPy: `np.busday_offset` applied directly to the sampled date array.
print('NumPy: ', time_it("result_np = np.busday_offset(input_dates, 17)"))

# pandas: adding a `BusinessDay(17)` offset to the 'ts' column of `df_pd`.
print('pandas: ', time_it("result_pd = df_pd['ts'] + pd.tseries.offsets.BusinessDay(17)"))

# BENCHMARK 2: WITH HOLIDAYS
#
# Rebinds `setup`, so every later `time_it` call uses this version instead of
# the benchmark-1 one. Compared with benchmark 1 it additionally builds
# `uk_holidays` (UK holidays for 2020-2023 from the `holidays` package) and
# removes those dates from the candidate dates before sampling, so the inputs
# are never holidays themselves.

setup = """
import polars as pl
import polars_business
from datetime import date
import numpy as np
import pandas as pd
import holidays
import warnings

uk_holidays = list(holidays.country_holidays('UK', years=[2020, 2021, 2022, 2023]))

dates = pl.date_range(date(2020, 1, 1), date(2024, 1, 1), closed='left', eager=True)
dates = dates.filter(~dates.is_in(uk_holidays))
dates = dates.filter(dates.dt.weekday() < 6)
size = 10_000_000
input_dates = np.random.choice(dates, size)

df = pl.DataFrame({
'ts': input_dates,
})

df_pd = pd.DataFrame({
'ts': input_dates,
})
"""

# Same comparison as benchmark 1, but passing `uk_holidays` to both libraries.
# No pandas timing is run for this benchmark.

# polars-business with the holiday list forwarded to the plugin.
print('Polars-business: ', time_it("result_pl = df.select(pl.col('ts').business.advance_n_days(n=17, holidays=uk_holidays))"))

# NumPy with the same holiday list passed to `np.busday_offset`.
print('NumPy: ', time_it("result_np = np.busday_offset(input_dates, 17, holidays=uk_holidays)"))
26 changes: 17 additions & 9 deletions src/polars_business/polars_business/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,22 @@ class BusinessDayTools:
def __init__(self, expr: pl.Expr):
    """Store ``expr`` cast to ``pl.Int32`` for use by the accessor methods."""
    int32_expr = expr.cast(pl.Int32)
    self._expr = int32_expr


def advance_n_days(self, n, holidays=None) -> pl.Expr:
    """Register the ``advance_n_days`` plugin call on the wrapped expression.

    Parameters
    ----------
    n
        Number of days to advance by. It is forwarded unchanged as the first
        plugin argument.
    holidays
        Optional iterable of holiday dates. When given, the values are
        de-duplicated, wrapped in a one-row list ``pl.Series``, cast to
        ``pl.List(pl.Int32)`` and passed as a second plugin argument.
        When ``None`` (the default), only ``n`` is passed.

    Returns
    -------
    pl.Expr
        The expression returned by ``_register_plugin``.
    """
    # NOTE: `n` is intentionally not validated here; the README documents
    # that it may be an expression rather than a plain integer.
    plugin_args = [n]
    if holidays is not None:
        # Int32 matches the cast applied to the wrapped expression in
        # `__init__`, so dates and holidays share one representation.
        plugin_args.append(
            pl.Series([list(set(holidays))]).cast(pl.List(pl.Int32))
        )

    # Single registration site; only the argument list differs between the
    # "no holidays" and "with holidays" cases.
    return self._expr._register_plugin(
        lib=lib,
        symbol="advance_n_days",
        is_elementwise=True,
        args=plugin_args,
    )
Loading