From c0c1e6933e505d4c908c79a8ee9115643a482089 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Mon, 22 Jan 2024 10:06:48 +0000 Subject: [PATCH 1/8] wip (check lhs naming rule) --- polars_xdt/__init__.py | 1088 +++++++++++++++----------------- tests/ceil_test.py | 3 +- tests/julian_date_test.py | 4 +- tests/offsets_test.py | 101 --- tests/test_business_offsets.py | 20 +- tests/test_format_localized.py | 12 +- tests/test_is_busday.py | 2 +- tests/test_sub.py | 23 +- tests/test_timezone.py | 10 +- 9 files changed, 529 insertions(+), 734 deletions(-) delete mode 100644 tests/offsets_test.py diff --git a/polars_xdt/__init__.py b/polars_xdt/__init__.py index 7ee926a..797de3b 100644 --- a/polars_xdt/__init__.py +++ b/polars_xdt/__init__.py @@ -20,7 +20,7 @@ from typing_extensions import TypeAlias if TYPE_CHECKING: - from polars.type_aliases import PolarsDataType + from polars.type_aliases import PolarsDataType, IntoExpr RollStrategy: TypeAlias = Literal["raise", "forward", "backward"] @@ -46,31 +46,24 @@ def get_weekmask(weekend: Sequence[str]) -> list[bool]: return weekmask -@pl.api.register_expr_namespace("xdt") -class ExprXDTNamespace: - """eXtra stuff for DateTimes.""" - - def __init__(self, expr: pl.Expr) -> None: - self._expr = expr +def offset_by( + expr: IntoExpr, + by: str | pl.Expr, + *, + weekend: Sequence[str] = ("Sat", "Sun"), + holidays: Sequence[date] | None = None, + roll: RollStrategy = "raise", +) -> pl.Expr: + """Offset this date by a relative time offset. - def offset_by( - self, - by: str | pl.Expr, - *, - weekend: Sequence[str] = ("Sat", "Sun"), - holidays: Sequence[date] | None = None, - roll: RollStrategy = "raise", - ) -> XDTExpr: - """Offset this date by a relative time offset. - - Parameters - ---------- - by - The offset to apply. This can be a string of the form "nbd" (where n - is an integer), or a polars expression that evaluates to such a string. 
- Additional units are passed to `polars.dt.offset_by`. - weekend - The days of the week that are considered weekends. Defaults to ("Sat", "Sun"). + Parameters + ---------- + by + The offset to apply. This can be a string of the form "nbd" (where n + is an integer), or a polars expression that evaluates to such a string. + Additional units are passed to `polars.dt.offset_by`. + weekend + The days of the week that are considered weekends. Defaults to ("Sat", "Sun"). holidays The holidays to exclude from the calculation. Defaults to None. roll @@ -156,607 +149,522 @@ def offset_by( │ 2024-01-04 ┆ -3bd ┆ 2024-01-01 │ └────────────┴──────┴──────────────┘ """ - if ( - isinstance(by, str) - and (match := re.search(r"(\d+bd)", by)) is not None - and (len(match.group(1)) == len(by)) - ): - # Fast path - do we have a business day offset, and nothing else? - n: int | pl.Expr = int(by[:-2]) - fastpath = True - else: - if not isinstance(by, pl.Expr): - by = pl.lit(by) - n = (by.str.extract(r"^(-?)") + by.str.extract(r"(\d+)bd")).cast( - pl.Int32, - ) - by = by.str.replace(r"(\d+bd)", "") - fastpath = False - - if not holidays: - holidays_int = [] - else: - holidays_int = sorted( - {(holiday - date(1970, 1, 1)).days for holiday in holidays}, - ) - weekmask = get_weekmask(weekend) - - result = self._expr.register_plugin( - lib=lib, - symbol="advance_n_days", - is_elementwise=True, - args=[n], - kwargs={ - "holidays": holidays_int, - "weekmask": weekmask, - "roll": roll, - }, + expr = wrap_expr(parse_as_expression(expr)) + if ( + isinstance(by, str) + and (match := re.search(r"(\d+bd)", by)) is not None + and (len(match.group(1)) == len(by)) + ): + # Fast path - do we have a business day offset, and nothing else? 
+ n: int | pl.Expr = int(by[:-2]) + fastpath = True + else: + if not isinstance(by, pl.Expr): + by = pl.lit(by) + n = (by.str.extract(r"^(-?)") + by.str.extract(r"(\d+)bd")).cast( + pl.Int32, ) - if fastpath: - return cast(XDTExpr, result) - return cast(XDTExpr, result.dt.offset_by(by)) + by = by.str.replace(r"(\d+bd)", "") + fastpath = False - def sub( - self, - end_dates: str | pl.Expr, - *, - weekend: Sequence[str] = ("Sat", "Sun"), - holidays: Sequence[date] | None = None, - ) -> XDTExpr: - weekmask = get_weekmask(weekend) - if not holidays: - holidays_int = [] - else: - holidays_int = sorted( - {(holiday - date(1970, 1, 1)).days for holiday in holidays}, - ) - if isinstance(end_dates, str): - end_dates = pl.col(end_dates) - result = self._expr.register_plugin( - lib=lib, - symbol="sub", - is_elementwise=True, - args=[end_dates], - kwargs={ - "weekmask": weekmask, - "holidays": holidays_int, - }, + if not holidays: + holidays_int = [] + else: + holidays_int = sorted( + {(holiday - date(1970, 1, 1)).days for holiday in holidays}, ) - return cast(XDTExpr, result) - - def is_workday( - self, - *, - weekend: Sequence[str] = ("Sat", "Sun"), - holidays: Sequence[date] | None = None, - ) -> pl.Expr: - """Determine whether a day is a workday. - - Parameters - ---------- - weekend - The days of the week that are considered weekends. Defaults to ("Sat", "Sun"). - holidays - The holidays to exclude from the calculation. Defaults to None. This should - be a list of ``datetime.date`` s. - - Returns - ------- - polars.Expr - - Examples - -------- - >>> from datetime import date - >>> import polars as pl - >>> import polars_xdt # noqa: F401 - >>> df = pl.DataFrame( - ... { - ... "date": [ - ... date(2023, 1, 4), - ... date(2023, 5, 1), - ... date(2023, 9, 9), - ... ], - ... } - ... 
) - >>> df.with_columns(is_workday=pl.col("date").xdt.is_workday()) - shape: (3, 2) - ┌────────────┬────────────┐ - │ date ┆ is_workday │ - │ --- ┆ --- │ - │ date ┆ bool │ - ╞════════════╪════════════╡ - │ 2023-01-04 ┆ true │ - │ 2023-05-01 ┆ true │ - │ 2023-09-09 ┆ false │ - └────────────┴────────────┘ - """ - weekmask = get_weekmask(weekend) - if not holidays: - holidays_int = [] - else: - holidays_int = sorted( - {(holiday - date(1970, 1, 1)).days for holiday in holidays}, - ) - return self._expr.register_plugin( - lib=lib, - symbol="is_workday", - is_elementwise=True, - args=[], - kwargs={ - "weekmask": weekmask, - "holidays": holidays_int, - }, + weekmask = get_weekmask(weekend) + + result = expr.register_plugin( + lib=lib, + symbol="advance_n_days", + is_elementwise=True, + args=[n], + kwargs={ + "holidays": holidays_int, + "weekmask": weekmask, + "roll": roll, + }, + ) + if fastpath: + return result + return result.dt.offset_by(by) + +def sub( + expr: pl.Expr, + end_dates: pl.Expr, + *, + weekend: Sequence[str] = ("Sat", "Sun"), + holidays: Sequence[date] | None = None, +) -> pl.Expr: + weekmask = get_weekmask(weekend) + if not holidays: + holidays_int = [] + else: + holidays_int = sorted( + {(holiday - date(1970, 1, 1)).days for holiday in holidays}, ) + return expr.register_plugin( + lib=lib, + symbol="sub", + is_elementwise=True, + args=[end_dates], + kwargs={ + "weekmask": weekmask, + "holidays": holidays_int, + }, + ) - def from_local_datetime( - self, - from_tz: str | Expr, - to_tz: str, - ambiguous: Ambiguous = "raise", - ) -> XDTExpr: - """Converts from local datetime in given time zone to new timezone. 
- - Parameters - ---------- - from_tz - Current timezone of each datetime - to_tz - Timezone to convert to - ambiguous - Determine how to deal with ambiguous datetimes: - - - `'raise'` (default): raise - - `'earliest'`: use the earliest datetime - - `'latest'`: use the latest datetime +def is_workday( + expr: str | pl.Expr, + *, + weekend: Sequence[str] = ("Sat", "Sun"), + holidays: Sequence[date] | None = None, +) -> pl.Expr: + """Determine whether a day is a workday. - Returns - ------- - Expr - Expression of data type :class:`DateTime`. + Parameters + ---------- + weekend + The days of the week that are considered weekends. Defaults to ("Sat", "Sun"). + holidays + The holidays to exclude from the calculation. Defaults to None. This should + be a list of ``datetime.date`` s. - Examples - -------- - You can go from a localized datetime back to expressing the datetimes - in a single timezone with `from_local_datetime`. + Returns + ------- + polars.Expr - >>> from datetime import datetime - >>> import polars_xdt # noqa: F401 - >>> df = pl.DataFrame( - ... { - ... "local_dt": [ - ... datetime(2020, 10, 10, 1), - ... datetime(2020, 10, 10, 2), - ... datetime(2020, 10, 9, 20), - ... ], - ... "timezone": [ - ... "Europe/London", - ... "Africa/Kigali", - ... "America/New_York", - ... ], - ... } - ... ) - >>> df.with_columns( - ... pl.col("local_dt") - ... .xdt.from_local_datetime(pl.col("timezone"), "UTC") - ... .alias("date") - ... 
) - shape: (3, 3) - ┌─────────────────────┬──────────────────┬─────────────────────────┐ - │ local_dt ┆ timezone ┆ date │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ str ┆ datetime[μs, UTC] │ - ╞═════════════════════╪══════════════════╪═════════════════════════╡ - │ 2020-10-10 01:00:00 ┆ Europe/London ┆ 2020-10-10 00:00:00 UTC │ - │ 2020-10-10 02:00:00 ┆ Africa/Kigali ┆ 2020-10-10 00:00:00 UTC │ - │ 2020-10-09 20:00:00 ┆ America/New_York ┆ 2020-10-10 00:00:00 UTC │ - └─────────────────────┴──────────────────┴─────────────────────────┘ - """ - from_tz = wrap_expr(parse_as_expression(from_tz, str_as_lit=True)) - result = self._expr.register_plugin( - lib=lib, - symbol="from_local_datetime", - is_elementwise=True, - args=[from_tz], - kwargs={ - "to_tz": to_tz, - "ambiguous": ambiguous, - }, + Examples + -------- + >>> from datetime import date + >>> import polars as pl + >>> import polars_xdt # noqa: F401 + >>> df = pl.DataFrame( + ... { + ... "date": [ + ... date(2023, 1, 4), + ... date(2023, 5, 1), + ... date(2023, 9, 9), + ... ], + ... } + ... ) + >>> df.with_columns(is_workday=pl.col("date").xdt.is_workday()) + shape: (3, 2) + ┌────────────┬────────────┐ + │ date ┆ is_workday │ + │ --- ┆ --- │ + │ date ┆ bool │ + ╞════════════╪════════════╡ + │ 2023-01-04 ┆ true │ + │ 2023-05-01 ┆ true │ + │ 2023-09-09 ┆ false │ + └────────────┴────────────┘ + """ + if isinstance(expr, str): + expr = pl.col(expr) + weekmask = get_weekmask(weekend) + if not holidays: + holidays_int = [] + else: + holidays_int = sorted( + {(holiday - date(1970, 1, 1)).days for holiday in holidays}, ) - return cast(XDTExpr, result) - - def to_local_datetime( - self, - time_zone: str | Expr, - ) -> XDTExpr: - """Convert to local datetime in given time zone. + return expr.register_plugin( + lib=lib, + symbol="is_workday", + is_elementwise=True, + args=[], + kwargs={ + "weekmask": weekmask, + "holidays": holidays_int, + }, + ) - Parameters - ---------- - time_zone - Time zone to convert to. 
+def from_local_datetime( + expr: str | pl.Expr, + from_tz: str | Expr, + to_tz: str, + ambiguous: Ambiguous = "raise", +) -> pl.Expr: + """Converts from local datetime in given time zone to new timezone. - Returns - ------- - Expr - Expression of data type :class:`DateTime`. + Parameters + ---------- + from_tz + Current timezone of each datetime + to_tz + Timezone to convert to + ambiguous + Determine how to deal with ambiguous datetimes: - Examples - -------- - You can use `to_local_datetime` to figure out how a tz-aware datetime - will be expressed as a local datetime. + - `'raise'` (default): raise + - `'earliest'`: use the earliest datetime + - `'latest'`: use the latest datetime - >>> from datetime import datetime - >>> import polars_xdt # noqa: F401 - >>> df = pl.DataFrame( - ... { - ... "date_col": [datetime(2020, 10, 10)] * 3, - ... "timezone": [ - ... "Europe/London", - ... "Africa/Kigali", - ... "America/New_York", - ... ], - ... } - ... ).with_columns(pl.col("date_col").dt.replace_time_zone("UTC")) - >>> df.with_columns( - ... pl.col("date_col") - ... .xdt.to_local_datetime(pl.col("timezone")) - ... .alias("local_dt") - ... 
) - shape: (3, 3) - ┌─────────────────────────┬──────────────────┬─────────────────────┐ - │ date_col ┆ timezone ┆ local_dt │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs, UTC] ┆ str ┆ datetime[μs] │ - ╞═════════════════════════╪══════════════════╪═════════════════════╡ - │ 2020-10-10 00:00:00 UTC ┆ Europe/London ┆ 2020-10-10 01:00:00 │ - │ 2020-10-10 00:00:00 UTC ┆ Africa/Kigali ┆ 2020-10-10 02:00:00 │ - │ 2020-10-10 00:00:00 UTC ┆ America/New_York ┆ 2020-10-09 20:00:00 │ - └─────────────────────────┴──────────────────┴─────────────────────┘ - """ - time_zone = wrap_expr(parse_as_expression(time_zone, str_as_lit=True)) - result = self._expr.register_plugin( - lib=lib, - symbol="to_local_datetime", - is_elementwise=True, - args=[time_zone], - ) - return cast(XDTExpr, result) - - def format_localized( - self, - format: str, # noqa: A002 - locale: str = "uk_UA", - ) -> XDTExpr: - """Convert to local datetime in given time zone. + Returns + ------- + Expr + Expression of data type :class:`DateTime`. - Parameters - ---------- - format - Format string, see https://docs.rs/chrono/latest/chrono/format/strftime/index.html - for what's available. - locale - Locale to use for formatting. Defaults to "uk_UA", because that's what the OP - requested https://github.com/pola-rs/polars/issues/12341. + Examples + -------- + You can go from a localized datetime back to expressing the datetimes + in a single timezone with `from_local_datetime`. - Returns - ------- - Expr - Expression of data type :class:`Utf8`. + >>> from datetime import datetime + >>> import polars_xdt # noqa: F401 + >>> df = pl.DataFrame( + ... { + ... "local_dt": [ + ... datetime(2020, 10, 10, 1), + ... datetime(2020, 10, 10, 2), + ... datetime(2020, 10, 9, 20), + ... ], + ... "timezone": [ + ... "Europe/London", + ... "Africa/Kigali", + ... "America/New_York", + ... ], + ... } + ... ) + >>> df.with_columns( + ... pl.col("local_dt") + ... .xdt.from_local_datetime(pl.col("timezone"), "UTC") + ... .alias("date") + ... 
) + shape: (3, 3) + ┌─────────────────────┬──────────────────┬─────────────────────────┐ + │ local_dt ┆ timezone ┆ date │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ datetime[μs, UTC] │ + ╞═════════════════════╪══════════════════╪═════════════════════════╡ + │ 2020-10-10 01:00:00 ┆ Europe/London ┆ 2020-10-10 00:00:00 UTC │ + │ 2020-10-10 02:00:00 ┆ Africa/Kigali ┆ 2020-10-10 00:00:00 UTC │ + │ 2020-10-09 20:00:00 ┆ America/New_York ┆ 2020-10-10 00:00:00 UTC │ + └─────────────────────┴──────────────────┴─────────────────────────┘ + """ + if isinstance(expr, str): + expr = pl.col(expr) + from_tz = wrap_expr(parse_as_expression(from_tz, str_as_lit=True)) + return expr.register_plugin( + lib=lib, + symbol="from_local_datetime", + is_elementwise=True, + args=[from_tz], + kwargs={ + "to_tz": to_tz, + "ambiguous": ambiguous, + }, + ) - Examples - -------- - >>> from datetime import datetime - >>> import polars_xdt # noqa: F401 - >>> df = pl.DataFrame( - ... { - ... "date_col": [datetime(2024, 8, 24), datetime(2024, 10, 1)], - ... } - ... ) - >>> df.with_columns( - ... result=pl.col("date_col").xdt.format_localized( - ... "%A, %d %B %Y", "uk_UA" - ... ) - ... ) - shape: (2, 2) - ┌─────────────────────┬──────────────────────────┐ - │ date_col ┆ result │ - │ --- ┆ --- │ - │ datetime[μs] ┆ str │ - ╞═════════════════════╪══════════════════════════╡ - │ 2024-08-24 00:00:00 ┆ субота, 24 серпня 2024 │ - │ 2024-10-01 00:00:00 ┆ вівторок, 01 жовтня 2024 │ - └─────────────────────┴──────────────────────────┘ - """ - result = self._expr.register_plugin( - lib=lib, - symbol="format_localized", - is_elementwise=True, - args=[], - kwargs={"format": format, "locale": locale}, - ) - return cast(XDTExpr, result) +def to_local_datetime( + expr: str | pl.Expr, + time_zone: str | Expr, +) -> pl.Expr: + """Convert to local datetime in given time zone. - def to_julian_date(self) -> XDTExpr: - """Return the Julian date corresponding to given datetimes. 
+ Parameters + ---------- + time_zone + Time zone to convert to. - Examples - -------- - >>> from datetime import datetime - >>> import polars_xdt # noqa: F401 - >>> df = pl.DataFrame( - ... { - ... "date_col": [ - ... datetime(2013, 1, 1, 0, 30), - ... datetime(2024, 1, 7, 13, 18, 51), - ... ], - ... } - ... ) - >>> with pl.Config(float_precision=10) as cfg: - ... df.with_columns( - ... julian_date=pl.col("date_col").xdt.to_julian_date() - ... ) - shape: (2, 2) - ┌─────────────────────┬────────────────────┐ - │ date_col ┆ julian_date │ - │ --- ┆ --- │ - │ datetime[μs] ┆ f64 │ - ╞═════════════════════╪════════════════════╡ - │ 2013-01-01 00:30:00 ┆ 2456293.5208333335 │ - │ 2024-01-07 13:18:51 ┆ 2460317.0547569445 │ - └─────────────────────┴────────────────────┘ - """ - result = self._expr.register_plugin( - lib=lib, - symbol="to_julian_date", - is_elementwise=True, - args=[], - ) - return cast(XDTExpr, result) + Returns + ------- + Expr + Expression of data type :class:`DateTime`. - def ceil( - self, - every: str | pl.Expr, - ) -> XDTExpr: - """Find "ceiling" of datetime. - - Parameters - ---------- - every - Duration string, created with the - the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - These strings can be combined: - - - 3d12h4m25s # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". + Examples + -------- + You can use `to_local_datetime` to figure out how a tz-aware datetime + will be expressed as a local datetime. - Returns - ------- - Expr - Expression of data type :class:`Utf8`. 
+ >>> from datetime import datetime + >>> import polars_xdt # noqa: F401 + >>> df = pl.DataFrame( + ... { + ... "date_col": [datetime(2020, 10, 10)] * 3, + ... "timezone": [ + ... "Europe/London", + ... "Africa/Kigali", + ... "America/New_York", + ... ], + ... } + ... ).with_columns(pl.col("date_col").dt.replace_time_zone("UTC")) + >>> df.with_columns( + ... pl.col("date_col") + ... .xdt.to_local_datetime(pl.col("timezone")) + ... .alias("local_dt") + ... ) + shape: (3, 3) + ┌─────────────────────────┬──────────────────┬─────────────────────┐ + │ date_col ┆ timezone ┆ local_dt │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs, UTC] ┆ str ┆ datetime[μs] │ + ╞═════════════════════════╪══════════════════╪═════════════════════╡ + │ 2020-10-10 00:00:00 UTC ┆ Europe/London ┆ 2020-10-10 01:00:00 │ + │ 2020-10-10 00:00:00 UTC ┆ Africa/Kigali ┆ 2020-10-10 02:00:00 │ + │ 2020-10-10 00:00:00 UTC ┆ America/New_York ┆ 2020-10-09 20:00:00 │ + └─────────────────────────┴──────────────────┴─────────────────────┘ + """ + if isinstance(expr, str): + expr = pl.col(expr) + time_zone = wrap_expr(parse_as_expression(time_zone, str_as_lit=True)) + return expr.register_plugin( + lib=lib, + symbol="to_local_datetime", + is_elementwise=True, + args=[time_zone], + ) - Examples - -------- - >>> from datetime import datetime - >>> import polars_xdt # noqa: F401 - >>> df = pl.DataFrame( - ... { - ... "date_col": [datetime(2024, 8, 24), datetime(2024, 10, 1)], - ... } - ... 
) - >>> df.with_columns(result=pl.col("date_col").xdt.ceil("1mo")) - shape: (2, 2) - ┌─────────────────────┬─────────────────────┐ - │ date_col ┆ result │ - │ --- ┆ --- │ - │ datetime[μs] ┆ datetime[μs] │ - ╞═════════════════════╪═════════════════════╡ - │ 2024-08-24 00:00:00 ┆ 2024-09-01 00:00:00 │ - │ 2024-10-01 00:00:00 ┆ 2024-10-01 00:00:00 │ - └─────────────────────┴─────────────────────┘ - """ - truncated = self._expr.dt.truncate(every) - result = ( - pl.when(self._expr == truncated) - .then(self._expr) - .otherwise(truncated.dt.offset_by(every)) - ) - return cast(XDTExpr, result) +def format_localized( + expr: str | pl.Expr, + format: str, # noqa: A002 + locale: str = "uk_UA", +) -> pl.Expr: + """Convert to local datetime in given time zone. - def base_utc_offset(self) -> XDTExpr: - """ - Base offset from UTC. + Parameters + ---------- + format + Format string, see https://docs.rs/chrono/latest/chrono/format/strftime/index.html + for what's available. + locale + Locale to use for formatting. Defaults to "uk_UA", because that's what the OP + requested https://github.com/pola-rs/polars/issues/12341. - This is usually constant for all datetimes in a given time zone, but - may vary in the rare case that a country switches time zone, like - Samoa (Apia) did at the end of 2011. + Returns + ------- + Expr + Expression of data type :class:`Utf8`. - Returns - ------- - Expr - Expression of data type :class:`Duration`. + Examples + -------- + >>> from datetime import datetime + >>> import polars_xdt # noqa: F401 + >>> df = pl.DataFrame( + ... { + ... "date_col": [datetime(2024, 8, 24), datetime(2024, 10, 1)], + ... } + ... ) + >>> df.with_columns( + ... result=pl.col("date_col").xdt.format_localized( + ... "%A, %d %B %Y", "uk_UA" + ... ) + ... 
) + shape: (2, 2) + ┌─────────────────────┬──────────────────────────┐ + │ date_col ┆ result │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪══════════════════════════╡ + │ 2024-08-24 00:00:00 ┆ субота, 24 серпня 2024 │ + │ 2024-10-01 00:00:00 ┆ вівторок, 01 жовтня 2024 │ + └─────────────────────┴──────────────────────────┘ + """ + if isinstance(expr, str): + expr = pl.col(expr) + return expr.register_plugin( + lib=lib, + symbol="format_localized", + is_elementwise=True, + args=[], + kwargs={"format": format, "locale": locale}, + ) - See Also - -------- - dst_offset : Daylight savings offset from UTC. +def to_julian_date(expr: str | pl.Expr) -> pl.Expr: + """Return the Julian date corresponding to given datetimes. - Examples - -------- - >>> from datetime import datetime - >>> import polars_xdt # noqa: F401 - >>> df = pl.DataFrame( - ... { - ... "ts": [datetime(2011, 12, 29), datetime(2012, 1, 1)], - ... } - ... ) - >>> df = df.with_columns( - ... pl.col("ts").dt.replace_time_zone("Pacific/Apia") - ... ) - >>> df.with_columns( - ... pl.col("ts").xdt.base_utc_offset().alias("base_utc_offset") - ... ) - shape: (2, 2) - ┌────────────────────────────┬─────────────────┐ - │ ts ┆ base_utc_offset │ - │ --- ┆ --- │ - │ datetime[μs, Pacific/Apia] ┆ duration[ms] │ - ╞════════════════════════════╪═════════════════╡ - │ 2011-12-29 00:00:00 -10 ┆ -11h │ - │ 2012-01-01 00:00:00 +14 ┆ 13h │ - └────────────────────────────┴─────────────────┘ - """ - result = self._expr.register_plugin( - lib=lib, - symbol="base_utc_offset", - is_elementwise=True, - args=[], - ) - return cast(XDTExpr, result) + Examples + -------- + >>> from datetime import datetime + >>> import polars_xdt # noqa: F401 + >>> df = pl.DataFrame( + ... { + ... "date_col": [ + ... datetime(2013, 1, 1, 0, 30), + ... datetime(2024, 1, 7, 13, 18, 51), + ... ], + ... } + ... ) + >>> with pl.Config(float_precision=10) as cfg: + ... df.with_columns( + ... 
julian_date=pl.col("date_col").xdt.to_julian_date() + ... ) + shape: (2, 2) + ┌─────────────────────┬────────────────────┐ + │ date_col ┆ julian_date │ + │ --- ┆ --- │ + │ datetime[μs] ┆ f64 │ + ╞═════════════════════╪════════════════════╡ + │ 2013-01-01 00:30:00 ┆ 2456293.5208333335 │ + │ 2024-01-07 13:18:51 ┆ 2460317.0547569445 │ + └─────────────────────┴────────────────────┘ + """ + if isinstance(expr, str): + expr = pl.col(expr) + return expr.register_plugin( + lib=lib, + symbol="to_julian_date", + is_elementwise=True, + args=[], + ) - def dst_offset(self) -> XDTExpr: - """ - Additional offset currently in effect (typically due to daylight saving time). +def ceil( + expr: str | pl.Expr, + every: str | pl.Expr, +) -> pl.Expr: + """Find "ceiling" of datetime. - Returns - ------- - Expr - Expression of data type :class:`Duration`. + Parameters + ---------- + every + Duration string, created with the + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + + These strings can be combined: + + - 3d12h4m25s # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". - See Also - -------- - base_utc_offset : Base offset from UTC. + Returns + ------- + Expr + Expression of data type :class:`Utf8`. - Examples - -------- - >>> from datetime import datetime - >>> import polars_xdt # noqa: F401 - >>> df = pl.DataFrame( - ... { - ... "ts": [datetime(2020, 10, 25), datetime(2020, 10, 26)], - ... } - ... ) - >>> df = df.with_columns( - ... pl.col("ts").dt.replace_time_zone("Europe/London") - ... 
) - >>> df.with_columns(pl.col("ts").xdt.dst_offset().alias("dst_offset")) - shape: (2, 2) - ┌─────────────────────────────┬──────────────┐ - │ ts ┆ dst_offset │ - │ --- ┆ --- │ - │ datetime[μs, Europe/London] ┆ duration[ms] │ - ╞═════════════════════════════╪══════════════╡ - │ 2020-10-25 00:00:00 BST ┆ 1h │ - │ 2020-10-26 00:00:00 GMT ┆ 0ms │ - └─────────────────────────────┴──────────────┘ - """ - result = self._expr.register_plugin( - lib=lib, - symbol="dst_offset", - is_elementwise=True, - args=[], - ) - return cast(XDTExpr, result) + Examples + -------- + >>> from datetime import datetime + >>> import polars_xdt # noqa: F401 + >>> df = pl.DataFrame( + ... { + ... "date_col": [datetime(2024, 8, 24), datetime(2024, 10, 1)], + ... } + ... ) + >>> df.with_columns(result=pl.col("date_col").xdt.ceil("1mo")) + shape: (2, 2) + ┌─────────────────────┬─────────────────────┐ + │ date_col ┆ result │ + │ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╡ + │ 2024-08-24 00:00:00 ┆ 2024-09-01 00:00:00 │ + │ 2024-10-01 00:00:00 ┆ 2024-10-01 00:00:00 │ + └─────────────────────┴─────────────────────┘ + """ + if isinstance(expr, str): + expr = pl.col(expr) + truncated = expr.dt.truncate(every) + return ( + pl.when(expr == truncated) + .then(expr) + .otherwise(truncated.dt.offset_by(every)) + ) - def day_name(self, locale: str | None = None) -> XDTExpr: - """ - Return day name, in specified locale (if specified). +def day_name(expr: str | pl.Expr, locale: str | None = None) -> pl.Expr: + """ + Return day name, in specified locale (if specified). - Returns - ------- - Expr - Expression of data type :class:`Utf8`. + Returns + ------- + Expr + Expression of data type :class:`Utf8`. - See Also - -------- - format_localized : format according to locale. + See Also + -------- + format_localized : format according to locale. 
- Examples - -------- - >>> from datetime import datetime - >>> import polars_xdt # noqa: F401 - >>> df = pl.DataFrame( - ... { - ... "ts": [datetime(2020, 10, 25), datetime(2020, 10, 26)], - ... } - ... ) - >>> df.with_columns( - ... english_day_name=pl.col("ts").xdt.day_name(), - ... french_day_name=pl.col("ts").xdt.day_name("fr_FR"), - ... ukrainian_day_name=pl.col("ts").xdt.day_name("uk_UA"), - ... ) - shape: (2, 4) - ┌─────────────────────┬──────────────────┬─────────────────┬────────────────────┐ - │ ts ┆ english_day_name ┆ french_day_name ┆ ukrainian_day_name │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ str ┆ str ┆ str │ - ╞═════════════════════╪══════════════════╪═════════════════╪════════════════════╡ - │ 2020-10-25 00:00:00 ┆ Sunday ┆ dimanche ┆ неділя │ - │ 2020-10-26 00:00:00 ┆ Monday ┆ lundi ┆ понеділок │ - └─────────────────────┴──────────────────┴─────────────────┴────────────────────┘ - """ - if locale is None: - result = self._expr.dt.to_string("%A") - else: - result = self._expr.xdt.format_localized("%A", locale=locale) # type: ignore[attr-defined] - return cast(XDTExpr, result) + Examples + -------- + >>> from datetime import datetime + >>> import polars_xdt # noqa: F401 + >>> df = pl.DataFrame( + ... { + ... "ts": [datetime(2020, 10, 25), datetime(2020, 10, 26)], + ... } + ... ) + >>> df.with_columns( + ... english_day_name=pl.col("ts").xdt.day_name(), + ... french_day_name=pl.col("ts").xdt.day_name("fr_FR"), + ... ukrainian_day_name=pl.col("ts").xdt.day_name("uk_UA"), + ... 
) + shape: (2, 4) + ┌─────────────────────┬──────────────────┬─────────────────┬────────────────────┐ + │ ts ┆ english_day_name ┆ french_day_name ┆ ukrainian_day_name │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ str ┆ str │ + ╞═════════════════════╪══════════════════╪═════════════════╪════════════════════╡ + │ 2020-10-25 00:00:00 ┆ Sunday ┆ dimanche ┆ неділя │ + │ 2020-10-26 00:00:00 ┆ Monday ┆ lundi ┆ понеділок │ + └─────────────────────┴──────────────────┴─────────────────┴────────────────────┘ + """ + if isinstance(expr, str): + expr = pl.col(expr) + if locale is None: + result = expr.dt.to_string("%A") + else: + result = format_localized(expr, "%A", locale=locale) # type: ignore[attr-defined] + return result - def month_name(self, locale: str | None = None) -> XDTExpr: - """ - Return month name, in specified locale (if specified). +def month_name(expr: str | pl.Expr, locale: str | None = None) -> pl.Expr: + """ + Return month name, in specified locale (if specified). - Returns - ------- - Expr - Expression of data type :class:`Utf8`. + Returns + ------- + Expr + Expression of data type :class:`Utf8`. - See Also - -------- - format_localized : format according to locale. + See Also + -------- + format_localized : format according to locale. - Examples - -------- - >>> from datetime import datetime - >>> import polars_xdt # noqa: F401 - >>> df = pl.DataFrame( - ... { - ... "ts": [datetime(2020, 10, 25), datetime(2020, 11, 26)], - ... } - ... ) - >>> df.with_columns( - ... english_month_name=pl.col("ts").xdt.month_name(), - ... french_month_name=pl.col("ts").xdt.month_name("fr_FR"), - ... ukrainian_month_name=pl.col("ts").xdt.month_name("uk_UA"), - ... 
) - shape: (2, 4) - ┌─────────────────────┬────────────────────┬───────────────────┬──────────────────────┐ - │ ts ┆ english_month_name ┆ french_month_name ┆ ukrainian_month_name │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ str ┆ str ┆ str │ - ╞═════════════════════╪════════════════════╪═══════════════════╪══════════════════════╡ - │ 2020-10-25 00:00:00 ┆ October ┆ octobre ┆ жовтня │ - │ 2020-11-26 00:00:00 ┆ November ┆ novembre ┆ листопада │ - └─────────────────────┴────────────────────┴───────────────────┴──────────────────────┘ - """ - if locale is None: - result = self._expr.dt.to_string("%B") - else: - result = self._expr.xdt.format_localized("%B", locale=locale) # type: ignore[attr-defined] - return cast(XDTExpr, result) + Examples + -------- + >>> from datetime import datetime + >>> import polars_xdt # noqa: F401 + >>> df = pl.DataFrame( + ... { + ... "ts": [datetime(2020, 10, 25), datetime(2020, 11, 26)], + ... } + ... ) + >>> df.with_columns( + ... english_month_name=pl.col("ts").xdt.month_name(), + ... french_month_name=pl.col("ts").xdt.month_name("fr_FR"), + ... ukrainian_month_name=pl.col("ts").xdt.month_name("uk_UA"), + ... 
) + shape: (2, 4) + ┌─────────────────────┬────────────────────┬───────────────────┬──────────────────────┐ + │ ts ┆ english_month_name ┆ french_month_name ┆ ukrainian_month_name │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ str ┆ str │ + ╞═════════════════════╪════════════════════╪═══════════════════╪══════════════════════╡ + │ 2020-10-25 00:00:00 ┆ October ┆ octobre ┆ жовтня │ + │ 2020-11-26 00:00:00 ┆ November ┆ novembre ┆ листопада │ + └─────────────────────┴────────────────────┴───────────────────┴──────────────────────┘ + """ + if isinstance(expr, str): + expr = pl.col(expr) + if locale is None: + result = expr.dt.to_string("%B") + else: + result = format_localized(expr, "%B", locale=locale) + return result class XDTExpr(pl.Expr): @@ -842,9 +750,7 @@ def workday_count( elif not isinstance(end, pl.Expr): end = pl.lit(end) - return end.xdt.sub(start, weekend=weekend, holidays=holidays).alias( # type: ignore[no-any-return, attr-defined] - "workday_count", - ) + return sub(end, start, weekend=weekend, holidays=holidays) # type: ignore[no-any-return, attr-defined] __all__ = [ diff --git a/tests/ceil_test.py b/tests/ceil_test.py index 78f463e..e9d552e 100644 --- a/tests/ceil_test.py +++ b/tests/ceil_test.py @@ -1,5 +1,4 @@ from datetime import datetime -import pytest import polars as pl import polars_xdt as xdt @@ -12,6 +11,6 @@ def test_ceil() -> None: }, schema={"date_col": pl.Datetime("ms")}, ) - result = df.select(result=xdt.col('date_col').xdt.ceil('1mo'))['result'] + result = df.select(result=xdt.ceil('date_col', '1mo'))['result'] assert result[0] == datetime(2024, 9, 1, 0, 0, 0, 0) assert result[1] == datetime(2024, 10, 1, 0, 0, 0, 0) diff --git a/tests/julian_date_test.py b/tests/julian_date_test.py index fabddcd..85c7eae 100644 --- a/tests/julian_date_test.py +++ b/tests/julian_date_test.py @@ -18,7 +18,7 @@ def test_against_pandas( date: dt.date, ) -> None: df = pl.DataFrame({'a': [date]}, schema={'a': pl.Datetime('ms')}) - result = 
df.select(xdt.col('a').xdt.to_julian_date())['a'].item() + result = df.select(xdt.to_julian_date('a'))['a'].item() expected = pd.Timestamp(df['a'].item()).to_julian_date() assert result == expected @@ -29,6 +29,6 @@ def test_against_pandas_date( date: dt.date, ) -> None: df = pl.DataFrame({'a': [date]}) - result = df.select(xdt.col('a').xdt.to_julian_date())['a'].item() + result = df.select(xdt.to_julian_date('a'))['a'].item() expected = pd.Timestamp(df['a'].item()).to_julian_date() assert result == expected diff --git a/tests/offsets_test.py b/tests/offsets_test.py deleted file mode 100644 index 53fd9e5..0000000 --- a/tests/offsets_test.py +++ /dev/null @@ -1,101 +0,0 @@ -from __future__ import annotations -import pytest -import polars as pl -import polars_xdt as xdt -from datetime import datetime - -from typing import TYPE_CHECKING -from polars.testing import assert_series_equal -from polars.exceptions import ComputeError -if TYPE_CHECKING: - from polars.type_aliases import TimeUnit - -@pytest.mark.parametrize("time_unit", ["ms", "us", "ns"]) -def test_base_utc_offset(time_unit: TimeUnit) -> None: - df = pl.datetime_range( - datetime(2011, 12, 29), - datetime(2012, 1, 1), - "2d", - time_zone="Pacific/Apia", - eager=True, - ).dt.cast_time_unit(time_unit).to_frame('a') - result = df.select(xdt.col('a').xdt.base_utc_offset().alias("base_utc_offset"))['base_utc_offset'] - expected = pl.Series( - "base_utc_offset", - [-11 * 3600 * 1000, 13 * 3600 * 1000], - dtype=pl.Duration("ms"), - ) - assert_series_equal(result, expected) - - -def test_base_utc_offset_lazy_schema() -> None: - ser = pl.datetime_range( - datetime(2020, 10, 25), - datetime(2020, 10, 26), - time_zone="Europe/London", - eager=True, - ) - df = pl.DataFrame({"ts": ser}).lazy() - result = df.with_columns(base_utc_offset=xdt.col("ts").xdt.base_utc_offset()).schema - expected = { - "ts": pl.Datetime(time_unit="us", time_zone="Europe/London"), - "base_utc_offset": pl.Duration(time_unit="ms"), - } - assert 
result == expected - - -def test_base_utc_offset_invalid() -> None: - df = pl.datetime_range( - datetime(2011, 12, 29), - datetime(2012, 1, 1), - "2d", - eager=True, - ).to_frame('a') - with pytest.raises( - ComputeError, - match=r"base_utc_offset only works on Datetime type", - ): - df.select(xdt.col('a').xdt.base_utc_offset()) - - -@pytest.mark.parametrize("time_unit", ["ms", "us", "ns"]) -def test_dst_offset(time_unit: TimeUnit) -> None: - df = pl.datetime_range( - datetime(2020, 10, 25), - datetime(2020, 10, 26), - time_zone="Europe/London", - eager=True, - ).dt.cast_time_unit(time_unit).to_frame('a') - result = df.select(xdt.col('a').xdt.dst_offset().alias("dst_offset"))['dst_offset'] - expected = pl.Series("dst_offset", [3_600 * 1_000, 0], dtype=pl.Duration("ms")) - assert_series_equal(result, expected) - - -def test_dst_offset_lazy_schema() -> None: - ser = pl.datetime_range( - datetime(2020, 10, 25), - datetime(2020, 10, 26), - time_zone="Europe/London", - eager=True, - ) - df = pl.DataFrame({"ts": ser}).lazy() - result = df.with_columns(dst_offset=xdt.col("ts").xdt.dst_offset()).schema - expected = { - "ts": pl.Datetime(time_unit="us", time_zone="Europe/London"), - "dst_offset": pl.Duration(time_unit="ms"), - } - assert result == expected - - -def test_dst_offset_invalid() -> None: - df = pl.datetime_range( - datetime(2011, 12, 29), - datetime(2012, 1, 1), - "2d", - eager=True, - ).to_frame('a') - with pytest.raises( - ComputeError, - match=r"base_utc_offset only works on Datetime type", - ): - df.select(xdt.col('a').xdt.dst_offset()) \ No newline at end of file diff --git a/tests/test_business_offsets.py b/tests/test_business_offsets.py index d21fc16..32a4a6c 100644 --- a/tests/test_business_offsets.py +++ b/tests/test_business_offsets.py @@ -27,7 +27,7 @@ def get_result( if dtype == pl.Date: result = ( pl.DataFrame({"ts": [date]}) - .select(xdt.col("ts").xdt.offset_by(by=by, **kwargs))["ts"] # type: ignore[arg-type] + .select(xdt.offset_by('ts', by=by, 
**kwargs))["ts"] # type: ignore[arg-type] .item() ) else: @@ -35,10 +35,10 @@ def get_result( result = ( pl.DataFrame({"ts": [dt.datetime(date.year, date.month, date.day)]}) .select( - pl.col("ts") + xdt.offset_by(pl.col("ts") .dt.cast_time_unit(dtype.time_unit) # type: ignore[union-attr] .dt.replace_time_zone(dtype.time_zone) # type: ignore[union-attr] - .xdt.offset_by(by=by, **kwargs) # type: ignore[attr-defined] + , by=by, **kwargs) .dt.date() )["ts"] .item() @@ -156,7 +156,7 @@ def test_extra_args(by: str, expected: dt.datetime) -> None: df = pl.DataFrame({"dates": [start]}) result = ( df.with_columns( - dates_shifted=xdt.col("dates").xdt.offset_by(by=by) + dates_shifted=xdt.offset_by('dates', by=by) ).with_columns(end_wday=pl.col("dates_shifted").dt.strftime("%a")) )["dates_shifted"].item() assert result == expected @@ -167,7 +167,7 @@ def test_extra_args_w_series() -> None: df = pl.DataFrame({"dates": [start] * 2, "by": ["1bd2h", "-1bd1h"]}) result = ( df.with_columns( - dates_shifted=xdt.col("dates").xdt.offset_by(by=pl.col("by")) + dates_shifted=xdt.offset_by('dates', by=pl.col("by")) ).with_columns(end_wday=pl.col("dates_shifted").dt.strftime("%a")) )["dates_shifted"] assert result[0] == dt.datetime(2000, 1, 4, 2) @@ -181,7 +181,7 @@ def test_starting_on_non_business() -> None: df = pl.DataFrame({"dates": [start]}) with pytest.raises(pl.ComputeError): df.with_columns( - dates_shifted=xdt.col("dates").xdt.offset_by( + dates_shifted=xdt.offset_by('dates', by=f"{n}bd", weekend=weekend, ) @@ -192,7 +192,7 @@ def test_starting_on_non_business() -> None: holidays = [start] with pytest.raises(pl.ComputeError): df.with_columns( - dates_shifted=xdt.col("dates").xdt.offset_by( + dates_shifted=xdt.offset_by('dates', by=f"{n}bd", holidays=holidays, weekend=weekend, @@ -206,8 +206,8 @@ def test_within_group_by() -> None: result = ( df.group_by(["a"]).agg( - minDate=pl.col.date.min().xdt.offset_by("-3bd"), # type: ignore[attr-defined] - 
maxDate=pl.col.date.max().xdt.offset_by("3bd"), # type: ignore[attr-defined] + minDate=xdt.offset_by(pl.col.date.min(), "-3bd"), # type: ignore[attr-defined] + maxDate=xdt.offset_by(pl.col.date.max(), "3bd"), # type: ignore[attr-defined] ) ).sort("a", descending=True) expected = pl.DataFrame( @@ -225,4 +225,4 @@ def test_invalid_roll_strategy() -> None: {"date": pl.date_range(dt.date(2023, 12, 1), dt.date(2023, 12, 5), eager=True)} ) with pytest.raises(pl.ComputeError): - df.with_columns(xdt.col("date").xdt.offset_by("1bd", roll="cabbage")) # type: ignore[arg-type] + df.with_columns(xdt.offset_by('date', "1bd", roll="cabbage")) # type: ignore[arg-type] diff --git a/tests/test_format_localized.py b/tests/test_format_localized.py index 6cbf450..c62d95d 100644 --- a/tests/test_format_localized.py +++ b/tests/test_format_localized.py @@ -20,9 +20,9 @@ def test_format_localized_datetime(time_unit: TimeUnit, expected_us: str, expect "date_col": ['2020-01-01T00:00:00.123456789'], }, ).select(pl.col("date_col").str.to_datetime(time_unit=time_unit)) - result = df.select(result=xdt.col("date_col").xdt.format_localized("%A, %d %B %Y %H:%M:%S%.f", 'en_US'))['result'] + result = df.select(result=xdt.format_localized('date_col', "%A, %d %B %Y %H:%M:%S%.f", 'en_US'))['result'] assert result[0] == expected_us - result = df.select(result=xdt.col("date_col").xdt.format_localized("%A, %d %B %Y %H:%M:%S%.f", 'uk_UA'))['result'] + result = df.select(result=xdt.format_localized('date_col', "%A, %d %B %Y %H:%M:%S%.f", 'uk_UA'))['result'] assert result[0] == expected_ukr def test_format_localized_date() -> None: @@ -31,10 +31,10 @@ def test_format_localized_date() -> None: "date_col": [date(2024, 8, 24), date(2024, 10, 1)], }, ) - result = df.select(result=xdt.col("date_col").xdt.format_localized("%A, %d %B %Y", 'en_US'))['result'] + result = df.select(result=xdt.format_localized('date_col', "%A, %d %B %Y", 'en_US'))['result'] assert result[0] == 'Saturday, 24 August 2024' assert 
result[1] == 'Tuesday, 01 October 2024' - result = df.select(result=xdt.col("date_col").xdt.format_localized("%A, %d %B %Y", 'uk_UA'))['result'] + result = df.select(result=xdt.format_localized('date_col', "%A, %d %B %Y", 'uk_UA'))['result'] assert result[0] == 'субота, 24 серпня 2024' assert result[1] == 'вівторок, 01 жовтня 2024' @@ -45,7 +45,7 @@ def test_tz_aware() -> None: }, schema={"date_col": pl.Datetime("ns", "Europe/London")}, ) - result = (df.select(result=xdt.col("date_col").xdt.format_localized("%A, %d %B %Y %z", "uk_UA"))) + result = (df.select(result=xdt.format_localized('date_col', "%A, %d %B %Y %z", "uk_UA"))) assert result['result'][0] == 'субота, 24 серпня 2024 +0100' assert result['result'][1] == 'вівторок, 01 жовтня 2024 +0100' @@ -57,5 +57,5 @@ def test_pre_epoch(time_unit: TimeUnit) -> None: }, schema={"date_col": pl.Datetime(time_unit, "Europe/London")}, ) - result = (df.select(result=xdt.col("date_col").xdt.format_localized("%A, %d %B %Y %z", "en_US"))) + result = (df.select(result=xdt.format_localized('date_col', "%A, %d %B %Y %z", "en_US"))) assert result['result'][0] == 'Friday, 01 January 1960 +0000' diff --git a/tests/test_is_busday.py b/tests/test_is_busday.py index 94a27a1..70b9041 100644 --- a/tests/test_is_busday.py +++ b/tests/test_is_busday.py @@ -20,7 +20,7 @@ def get_result( ) -> int: return ( # type: ignore[no-any-return] pl.DataFrame({"date": [date]}) - .select(xdt.col("date").xdt.is_workday(weekend=weekend, holidays=holidays))[ + .select(xdt.is_workday('date', weekend=weekend, holidays=holidays))[ "date" ] .item() diff --git a/tests/test_sub.py b/tests/test_sub.py index 0ceed7d..b919d9a 100644 --- a/tests/test_sub.py +++ b/tests/test_sub.py @@ -23,7 +23,7 @@ def get_result( ) -> int: return ( # type: ignore[no-any-return] pl.DataFrame({"end_date": [end_date]}) - .select(n=xdt.col("end_date").xdt.sub(start_date, weekend=weekend, holidays=holidays))["n"] # type: ignore[arg-type] + .select(n=xdt.workday_count(start_date, 
"end_date", weekend=weekend, holidays=holidays))["n"] # type: ignore[arg-type] .item() ) @@ -51,13 +51,9 @@ def test_against_np_busday_count( holidays: list[dt.date], function: Callable[[dt.date], dt.date | pl.Series], ) -> None: - result = get_result( - function(start_date), end_date, weekend=weekend, holidays=holidays - ) + result = get_result( function(start_date), end_date, weekend=weekend, holidays=holidays) weekmask = [0 if reverse_mapping[i] in weekend else 1 for i in range(1, 8)] - expected = np.busday_count( - start_date, end_date, weekmask=weekmask, holidays=holidays - ) + expected = np.busday_count( start_date, end_date, weekmask=weekmask, holidays=holidays) if start_date > end_date and tuple( int(v) for v in np.__version__.split(".")[:2] ) < (1, 25): @@ -112,8 +108,7 @@ def test_empty_weekmask() -> None: ) with pytest.raises(ValueError): df.select( - xdt.col("end").xdt.sub( - "start", weekend=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] + xdt.workday_count("start", "end", weekend=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] ) ) @@ -125,9 +120,7 @@ def test_sub_lit() -> None: } ) result = df.select( - xdt.col("end").xdt.sub( - pl.lit(dt.date(2020, 1, 1)), - ) + xdt.workday_count(pl.lit(dt.date(2020, 1, 1)), "end") ) assert result["end"][0] == 2 assert result["end"][1] == 3 @@ -140,12 +133,12 @@ def test_workday_count() -> None: "end": [dt.date(2020, 1, 8), dt.date(2020, 1, 20)], } ) - result = df.with_columns(xdt.workday_count("start", "end")) + result = df.with_columns(workday_count=xdt.workday_count("start", "end")) assert result["workday_count"][0] == 3 assert result["workday_count"][1] == 10 - result = df.with_columns(xdt.workday_count("start", dt.date(2020, 1, 8))) + result = df.with_columns(workday_count=xdt.workday_count("start", dt.date(2020, 1, 8))) assert result["workday_count"][0] == 3 assert result["workday_count"][1] == 2 - result = df.with_columns(xdt.workday_count(dt.date(2020, 1, 5), pl.col("end"))) + result = 
df.with_columns(workday_count=xdt.workday_count(dt.date(2020, 1, 5), pl.col("end"))) assert result["workday_count"][0] == 2 assert result["workday_count"][1] == 10 diff --git a/tests/test_timezone.py b/tests/test_timezone.py index 2d0c889..1fb54e0 100644 --- a/tests/test_timezone.py +++ b/tests/test_timezone.py @@ -39,7 +39,7 @@ def test_convert_tz_to_local_datetime( expected = df.with_columns(pl.lit(local_date).alias("local_dt")) result = df.with_columns( - xdt.col("date").xdt.to_local_datetime(pl.col("timezone")).alias("local_dt") + xdt.to_local_datetime('date', pl.col("timezone")).alias("local_dt") ) assert_frame_equal(result, expected) @@ -75,8 +75,7 @@ def test_convert_tz_from_local_datetime( ) result = df.with_columns( - xdt.col("local_date") - .xdt.from_local_datetime(pl.col("timezone"), "Europe/London") + xdt.from_local_datetime('local_date', pl.col("timezone"), "Europe/London") .alias("date") ) @@ -93,8 +92,7 @@ def test_convert_tz_from_local_datetime_literal() -> None: ) result = df.with_columns( - xdt.col("local_date") - .xdt.from_local_datetime("America/New_York", "Europe/London") + xdt.from_local_datetime('local_date', "America/New_York", "Europe/London") .alias("date") ) assert_frame_equal(result, expected) @@ -108,7 +106,7 @@ def test_convert_tz_to_local_datetime_literal() -> None: expected = df.with_columns(pl.lit(datetime(2020, 10, 14, 20, 0)).alias("local_dt")) result = df.with_columns( - xdt.col("date").xdt.to_local_datetime("America/New_York").alias("local_dt") + xdt.to_local_datetime("date", "America/New_York").alias("local_dt") ) assert_frame_equal(result, expected) From cfe5111cc00a788585c88263a5a4b484c4fc503c Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Mon, 22 Jan 2024 10:19:26 +0000 Subject: [PATCH 2/8] fix lhs-rule --- polars_xdt/__init__.py | 38 +++++++++++++++++++------------------- src/sub.rs | 2 +- tests/test_sub.py | 4 ++-- 3 files changed, 22 insertions(+), 22 deletions(-) diff 
--git a/polars_xdt/__init__.py b/polars_xdt/__init__.py index 797de3b..9b7714f 100644 --- a/polars_xdt/__init__.py +++ b/polars_xdt/__init__.py @@ -197,23 +197,7 @@ def sub( weekend: Sequence[str] = ("Sat", "Sun"), holidays: Sequence[date] | None = None, ) -> pl.Expr: - weekmask = get_weekmask(weekend) - if not holidays: - holidays_int = [] - else: - holidays_int = sorted( - {(holiday - date(1970, 1, 1)).days for holiday in holidays}, - ) - return expr.register_plugin( - lib=lib, - symbol="sub", - is_elementwise=True, - args=[end_dates], - kwargs={ - "weekmask": weekmask, - "holidays": holidays_int, - }, - ) + ... def is_workday( expr: str | pl.Expr, @@ -697,7 +681,7 @@ def workday_count( end: str | pl.Expr | date, weekend: Sequence[str] = ("Sat", "Sun"), holidays: Sequence[date] | None = None, -) -> XDTExpr: +) -> pl.Expr: """Count the number of workdays between two columns of dates. Parameters @@ -750,7 +734,23 @@ def workday_count( elif not isinstance(end, pl.Expr): end = pl.lit(end) - return sub(end, start, weekend=weekend, holidays=holidays) # type: ignore[no-any-return, attr-defined] + weekmask = get_weekmask(weekend) + if not holidays: + holidays_int = [] + else: + holidays_int = sorted( + {(holiday - date(1970, 1, 1)).days for holiday in holidays}, + ) + return start.register_plugin( + lib=lib, + symbol="sub", + is_elementwise=True, + args=[end], + kwargs={ + "weekmask": weekmask, + "holidays": holidays_int, + }, + ) __all__ = [ diff --git a/src/sub.rs b/src/sub.rs index 5088300..eefc654 100644 --- a/src/sub.rs +++ b/src/sub.rs @@ -49,8 +49,8 @@ fn date_diff( } pub(crate) fn impl_sub( - end_dates: &Series, start_dates: &Series, + end_dates: &Series, weekmask: &[bool; 7], holidays: Vec, ) -> PolarsResult { diff --git a/tests/test_sub.py b/tests/test_sub.py index b919d9a..265eeb2 100644 --- a/tests/test_sub.py +++ b/tests/test_sub.py @@ -122,8 +122,8 @@ def test_sub_lit() -> None: result = df.select( xdt.workday_count(pl.lit(dt.date(2020, 1, 1)), "end") ) - 
assert result["end"][0] == 2 - assert result["end"][1] == 3 + assert result["literal"][0] == 2 + assert result["literal"][1] == 3 def test_workday_count() -> None: From 40b667a54393784d561abb986ff3a1c9c799c083 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Mon, 22 Jan 2024 10:54:14 +0000 Subject: [PATCH 3/8] fixup --- .gitignore | 3 +- Makefile | 2 +- README.md | 15 +- docs/API.rst | 20 +- polars_xdt/__init__.py | 771 +-------------------------------- polars_xdt/functions.py | 703 ++++++++++++++++++++++++++++++ polars_xdt/namespace.py | 23 + src/expressions.rs | 4 +- src/sub.rs | 2 +- tests/ceil_test.py | 14 +- tests/julian_date_test.py | 19 +- tests/test_business_offsets.py | 73 +++- tests/test_date_range.py | 6 +- tests/test_format_localized.py | 78 +++- tests/test_is_busday.py | 14 +- tests/test_sub.py | 59 ++- tests/test_timezone.py | 16 +- 17 files changed, 957 insertions(+), 865 deletions(-) create mode 100644 polars_xdt/functions.py create mode 100644 polars_xdt/namespace.py diff --git a/.gitignore b/.gitignore index 6513c04..d88e5dd 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,5 @@ rust-toolchain.toml *.bat *.js docs/_build -*.so \ No newline at end of file +docs/api/* +*.so diff --git a/Makefile b/Makefile index 9fdee35..326f012 100644 --- a/Makefile +++ b/Makefile @@ -16,7 +16,7 @@ install-release: venv pre-commit: venv cargo fmt --all && cargo clippy --all-features venv/bin/python -m ruff check . 
--fix --exit-non-zero-on-fix - venv/bin/python -m ruff format + venv/bin/python -m ruff format polars_xdt tests venv/bin/python -m mypy polars_xdt tests test: venv diff --git a/README.md b/README.md index 6fb64f5..c02c2c8 100644 --- a/README.md +++ b/README.md @@ -30,16 +30,6 @@ Then, you'll need to install `polars-xdt`: pip install polars-xdt ``` -Then, if you can run -```python -import polars as pl -import polars_xdt # noqa: F401 - -print(pl.col('a').xdt) -``` -and see something like ``, -it means installation all worked correctly! - Read the [documentation](https://marcogorelli.github.io/polars-xdt-docs/) for a little tutorial and API reference. Basic Example @@ -49,7 +39,7 @@ Say we start with from datetime import date import polars as pl -import polars_xdt # noqa: F401 +import polars_xdt as xdt df = pl.DataFrame( @@ -61,7 +51,8 @@ Let's shift `Date` forwards by 5 days, excluding Saturday and Sunday: ```python result = df.with_columns( - date_shifted=pl.col("date").xdt.offset_by( + date_shifted=xdt.offset_by( + 'date', '5bd', weekend=('Sat', 'Sun'), ) diff --git a/docs/API.rst b/docs/API.rst index 8926f36..3f7630b 100644 --- a/docs/API.rst +++ b/docs/API.rst @@ -6,14 +6,12 @@ API polars_xdt.date_range polars_xdt.workday_count - polars_xdt.ExprXDTNamespace.base_utc_offset - polars_xdt.ExprXDTNamespace.ceil - polars_xdt.ExprXDTNamespace.day_name - polars_xdt.ExprXDTNamespace.dst_offset - polars_xdt.ExprXDTNamespace.format_localized - polars_xdt.ExprXDTNamespace.from_local_datetime - polars_xdt.ExprXDTNamespace.is_workday - polars_xdt.ExprXDTNamespace.month_name - polars_xdt.ExprXDTNamespace.offset_by - polars_xdt.ExprXDTNamespace.to_local_datetime - polars_xdt.ExprXDTNamespace.to_julian_date + polars_xdt.ceil + polars_xdt.day_name + polars_xdt.format_localized + polars_xdt.from_local_datetime + polars_xdt.is_workday + polars_xdt.month_name + polars_xdt.offset_by + polars_xdt.to_local_datetime + polars_xdt.to_julian_date diff --git a/polars_xdt/__init__.py 
b/polars_xdt/__init__.py index 9b7714f..717456b 100644 --- a/polars_xdt/__init__.py +++ b/polars_xdt/__init__.py @@ -1,761 +1,32 @@ from __future__ import annotations -import re -import sys -from datetime import date -from typing import TYPE_CHECKING, Iterable, Literal, Protocol, Sequence, cast - -import polars as pl -from polars.utils._parse_expr_input import parse_as_expression -from polars.utils._wrap import wrap_expr -from polars.utils.udfs import _get_shared_lib_location - +from polars_xdt.functions import ( + ceil, + day_name, + format_localized, + from_local_datetime, + is_workday, + month_name, + offset_by, + to_julian_date, + to_local_datetime, + workday_count, +) from polars_xdt.ranges import date_range from ._internal import __version__ -if sys.version_info >= (3, 10): - from typing import TypeAlias -else: - from typing_extensions import TypeAlias - -if TYPE_CHECKING: - from polars.type_aliases import PolarsDataType, IntoExpr - -RollStrategy: TypeAlias = Literal["raise", "forward", "backward"] - - -lib = _get_shared_lib_location(__file__) - -mapping = {"Mon": 1, "Tue": 2, "Wed": 3, "Thu": 4, "Fri": 5, "Sat": 6, "Sun": 7} -reverse_mapping = {value: key for key, value in mapping.items()} - -if TYPE_CHECKING: - from polars import Expr - from polars.type_aliases import Ambiguous - - -def get_weekmask(weekend: Sequence[str]) -> list[bool]: - if weekend == ("Sat", "Sun"): - weekmask = [True, True, True, True, True, False, False] - else: - weekmask = [reverse_mapping[i] not in weekend for i in range(1, 8)] - if sum(weekmask) == 0: - msg = f"At least one day of the week must be a business day. Got weekend={weekend}" - raise ValueError(msg) - return weekmask - - -def offset_by( - expr: IntoExpr, - by: str | pl.Expr, - *, - weekend: Sequence[str] = ("Sat", "Sun"), - holidays: Sequence[date] | None = None, - roll: RollStrategy = "raise", -) -> pl.Expr: - """Offset this date by a relative time offset. - - Parameters - ---------- - by - The offset to apply. 
This can be a string of the form "nbd" (where n - is an integer), or a polars expression that evaluates to such a string. - Additional units are passed to `polars.dt.offset_by`. - weekend - The days of the week that are considered weekends. Defaults to ("Sat", "Sun"). - holidays - The holidays to exclude from the calculation. Defaults to None. - roll - How to handle dates that fall on a non-workday. - - - "raise" raise an error (default). - - "forward" roll forward to the next business day. - - "backward" roll backward to the previous business day. - - Returns - ------- - polars.Expr - - Examples - -------- - >>> import polars as pl - >>> import polars_xdt # noqa: F401 - >>> df = pl.DataFrame( - ... {"date": [date(2023, 4, 3), date(2023, 9, 1), date(2024, 1, 4)]} - ... ) - >>> df.with_columns( - ... date_shifted=pl.col("date").xdt.offset_by("1bd"), - ... ) - shape: (3, 2) - ┌────────────┬──────────────┐ - │ date ┆ date_shifted │ - │ --- ┆ --- │ - │ date ┆ date │ - ╞════════════╪══════════════╡ - │ 2023-04-03 ┆ 2023-04-04 │ - │ 2023-09-01 ┆ 2023-09-04 │ - │ 2024-01-04 ┆ 2024-01-05 │ - └────────────┴──────────────┘ - - You can also specify custom weekends and holidays: - - >>> import holidays - >>> holidays_england = holidays.country_holidays( - ... "UK", subdiv="ENG", years=[2023, 2024] - ... ) - >>> df.with_columns( - ... date_shifted=pl.col("date").xdt.offset_by( - ... "5bd", - ... holidays=holidays_england, - ... weekend=["Fri", "Sat"], - ... roll="backward", - ... ), - ... ) - shape: (3, 2) - ┌────────────┬──────────────┐ - │ date ┆ date_shifted │ - │ --- ┆ --- │ - │ date ┆ date │ - ╞════════════╪══════════════╡ - │ 2023-04-03 ┆ 2023-04-11 │ - │ 2023-09-01 ┆ 2023-09-07 │ - │ 2024-01-04 ┆ 2024-01-11 │ - └────────────┴──────────────┘ - - You can also pass expressions to `by`: - - >>> df = pl.DataFrame( - ... { - ... "date": [ - ... date(2023, 4, 3), - ... date(2023, 9, 1), - ... date(2024, 1, 4), - ... ], - ... "by": ["1bd", "2bd", "-3bd"], - ... } - ... 
) - >>> df.with_columns( - ... date_shifted=pl.col("date").xdt.offset_by(pl.col("by")) - ... ) - shape: (3, 3) - ┌────────────┬──────┬──────────────┐ - │ date ┆ by ┆ date_shifted │ - │ --- ┆ --- ┆ --- │ - │ date ┆ str ┆ date │ - ╞════════════╪══════╪══════════════╡ - │ 2023-04-03 ┆ 1bd ┆ 2023-04-04 │ - │ 2023-09-01 ┆ 2bd ┆ 2023-09-05 │ - │ 2024-01-04 ┆ -3bd ┆ 2024-01-01 │ - └────────────┴──────┴──────────────┘ - """ - expr = wrap_expr(parse_as_expression(expr)) - if ( - isinstance(by, str) - and (match := re.search(r"(\d+bd)", by)) is not None - and (len(match.group(1)) == len(by)) - ): - # Fast path - do we have a business day offset, and nothing else? - n: int | pl.Expr = int(by[:-2]) - fastpath = True - else: - if not isinstance(by, pl.Expr): - by = pl.lit(by) - n = (by.str.extract(r"^(-?)") + by.str.extract(r"(\d+)bd")).cast( - pl.Int32, - ) - by = by.str.replace(r"(\d+bd)", "") - fastpath = False - - if not holidays: - holidays_int = [] - else: - holidays_int = sorted( - {(holiday - date(1970, 1, 1)).days for holiday in holidays}, - ) - weekmask = get_weekmask(weekend) - - result = expr.register_plugin( - lib=lib, - symbol="advance_n_days", - is_elementwise=True, - args=[n], - kwargs={ - "holidays": holidays_int, - "weekmask": weekmask, - "roll": roll, - }, - ) - if fastpath: - return result - return result.dt.offset_by(by) - -def sub( - expr: pl.Expr, - end_dates: pl.Expr, - *, - weekend: Sequence[str] = ("Sat", "Sun"), - holidays: Sequence[date] | None = None, -) -> pl.Expr: - ... - -def is_workday( - expr: str | pl.Expr, - *, - weekend: Sequence[str] = ("Sat", "Sun"), - holidays: Sequence[date] | None = None, -) -> pl.Expr: - """Determine whether a day is a workday. - - Parameters - ---------- - weekend - The days of the week that are considered weekends. Defaults to ("Sat", "Sun"). - holidays - The holidays to exclude from the calculation. Defaults to None. This should - be a list of ``datetime.date`` s. 
- - Returns - ------- - polars.Expr - - Examples - -------- - >>> from datetime import date - >>> import polars as pl - >>> import polars_xdt # noqa: F401 - >>> df = pl.DataFrame( - ... { - ... "date": [ - ... date(2023, 1, 4), - ... date(2023, 5, 1), - ... date(2023, 9, 9), - ... ], - ... } - ... ) - >>> df.with_columns(is_workday=pl.col("date").xdt.is_workday()) - shape: (3, 2) - ┌────────────┬────────────┐ - │ date ┆ is_workday │ - │ --- ┆ --- │ - │ date ┆ bool │ - ╞════════════╪════════════╡ - │ 2023-01-04 ┆ true │ - │ 2023-05-01 ┆ true │ - │ 2023-09-09 ┆ false │ - └────────────┴────────────┘ - """ - if isinstance(expr, str): - expr = pl.col(expr) - weekmask = get_weekmask(weekend) - if not holidays: - holidays_int = [] - else: - holidays_int = sorted( - {(holiday - date(1970, 1, 1)).days for holiday in holidays}, - ) - return expr.register_plugin( - lib=lib, - symbol="is_workday", - is_elementwise=True, - args=[], - kwargs={ - "weekmask": weekmask, - "holidays": holidays_int, - }, - ) - -def from_local_datetime( - expr: str | pl.Expr, - from_tz: str | Expr, - to_tz: str, - ambiguous: Ambiguous = "raise", -) -> pl.Expr: - """Converts from local datetime in given time zone to new timezone. - - Parameters - ---------- - from_tz - Current timezone of each datetime - to_tz - Timezone to convert to - ambiguous - Determine how to deal with ambiguous datetimes: - - - `'raise'` (default): raise - - `'earliest'`: use the earliest datetime - - `'latest'`: use the latest datetime - - Returns - ------- - Expr - Expression of data type :class:`DateTime`. - - Examples - -------- - You can go from a localized datetime back to expressing the datetimes - in a single timezone with `from_local_datetime`. - - >>> from datetime import datetime - >>> import polars_xdt # noqa: F401 - >>> df = pl.DataFrame( - ... { - ... "local_dt": [ - ... datetime(2020, 10, 10, 1), - ... datetime(2020, 10, 10, 2), - ... datetime(2020, 10, 9, 20), - ... ], - ... "timezone": [ - ... 
"Europe/London", - ... "Africa/Kigali", - ... "America/New_York", - ... ], - ... } - ... ) - >>> df.with_columns( - ... pl.col("local_dt") - ... .xdt.from_local_datetime(pl.col("timezone"), "UTC") - ... .alias("date") - ... ) - shape: (3, 3) - ┌─────────────────────┬──────────────────┬─────────────────────────┐ - │ local_dt ┆ timezone ┆ date │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ str ┆ datetime[μs, UTC] │ - ╞═════════════════════╪══════════════════╪═════════════════════════╡ - │ 2020-10-10 01:00:00 ┆ Europe/London ┆ 2020-10-10 00:00:00 UTC │ - │ 2020-10-10 02:00:00 ┆ Africa/Kigali ┆ 2020-10-10 00:00:00 UTC │ - │ 2020-10-09 20:00:00 ┆ America/New_York ┆ 2020-10-10 00:00:00 UTC │ - └─────────────────────┴──────────────────┴─────────────────────────┘ - """ - if isinstance(expr, str): - expr = pl.col(expr) - from_tz = wrap_expr(parse_as_expression(from_tz, str_as_lit=True)) - return expr.register_plugin( - lib=lib, - symbol="from_local_datetime", - is_elementwise=True, - args=[from_tz], - kwargs={ - "to_tz": to_tz, - "ambiguous": ambiguous, - }, - ) - -def to_local_datetime( - expr: str | pl.Expr, - time_zone: str | Expr, -) -> pl.Expr: - """Convert to local datetime in given time zone. - - Parameters - ---------- - time_zone - Time zone to convert to. - - Returns - ------- - Expr - Expression of data type :class:`DateTime`. - - Examples - -------- - You can use `to_local_datetime` to figure out how a tz-aware datetime - will be expressed as a local datetime. - - >>> from datetime import datetime - >>> import polars_xdt # noqa: F401 - >>> df = pl.DataFrame( - ... { - ... "date_col": [datetime(2020, 10, 10)] * 3, - ... "timezone": [ - ... "Europe/London", - ... "Africa/Kigali", - ... "America/New_York", - ... ], - ... } - ... ).with_columns(pl.col("date_col").dt.replace_time_zone("UTC")) - >>> df.with_columns( - ... pl.col("date_col") - ... .xdt.to_local_datetime(pl.col("timezone")) - ... .alias("local_dt") - ... 
) - shape: (3, 3) - ┌─────────────────────────┬──────────────────┬─────────────────────┐ - │ date_col ┆ timezone ┆ local_dt │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs, UTC] ┆ str ┆ datetime[μs] │ - ╞═════════════════════════╪══════════════════╪═════════════════════╡ - │ 2020-10-10 00:00:00 UTC ┆ Europe/London ┆ 2020-10-10 01:00:00 │ - │ 2020-10-10 00:00:00 UTC ┆ Africa/Kigali ┆ 2020-10-10 02:00:00 │ - │ 2020-10-10 00:00:00 UTC ┆ America/New_York ┆ 2020-10-09 20:00:00 │ - └─────────────────────────┴──────────────────┴─────────────────────┘ - """ - if isinstance(expr, str): - expr = pl.col(expr) - time_zone = wrap_expr(parse_as_expression(time_zone, str_as_lit=True)) - return expr.register_plugin( - lib=lib, - symbol="to_local_datetime", - is_elementwise=True, - args=[time_zone], - ) - -def format_localized( - expr: str | pl.Expr, - format: str, # noqa: A002 - locale: str = "uk_UA", -) -> pl.Expr: - """Convert to local datetime in given time zone. - - Parameters - ---------- - format - Format string, see https://docs.rs/chrono/latest/chrono/format/strftime/index.html - for what's available. - locale - Locale to use for formatting. Defaults to "uk_UA", because that's what the OP - requested https://github.com/pola-rs/polars/issues/12341. - - Returns - ------- - Expr - Expression of data type :class:`Utf8`. - - Examples - -------- - >>> from datetime import datetime - >>> import polars_xdt # noqa: F401 - >>> df = pl.DataFrame( - ... { - ... "date_col": [datetime(2024, 8, 24), datetime(2024, 10, 1)], - ... } - ... ) - >>> df.with_columns( - ... result=pl.col("date_col").xdt.format_localized( - ... "%A, %d %B %Y", "uk_UA" - ... ) - ... 
) - shape: (2, 2) - ┌─────────────────────┬──────────────────────────┐ - │ date_col ┆ result │ - │ --- ┆ --- │ - │ datetime[μs] ┆ str │ - ╞═════════════════════╪══════════════════════════╡ - │ 2024-08-24 00:00:00 ┆ субота, 24 серпня 2024 │ - │ 2024-10-01 00:00:00 ┆ вівторок, 01 жовтня 2024 │ - └─────────────────────┴──────────────────────────┘ - """ - if isinstance(expr, str): - expr = pl.col(expr) - return expr.register_plugin( - lib=lib, - symbol="format_localized", - is_elementwise=True, - args=[], - kwargs={"format": format, "locale": locale}, - ) - -def to_julian_date(expr: str | pl.Expr) -> pl.Expr: - """Return the Julian date corresponding to given datetimes. - - Examples - -------- - >>> from datetime import datetime - >>> import polars_xdt # noqa: F401 - >>> df = pl.DataFrame( - ... { - ... "date_col": [ - ... datetime(2013, 1, 1, 0, 30), - ... datetime(2024, 1, 7, 13, 18, 51), - ... ], - ... } - ... ) - >>> with pl.Config(float_precision=10) as cfg: - ... df.with_columns( - ... julian_date=pl.col("date_col").xdt.to_julian_date() - ... ) - shape: (2, 2) - ┌─────────────────────┬────────────────────┐ - │ date_col ┆ julian_date │ - │ --- ┆ --- │ - │ datetime[μs] ┆ f64 │ - ╞═════════════════════╪════════════════════╡ - │ 2013-01-01 00:30:00 ┆ 2456293.5208333335 │ - │ 2024-01-07 13:18:51 ┆ 2460317.0547569445 │ - └─────────────────────┴────────────────────┘ - """ - if isinstance(expr, str): - expr = pl.col(expr) - return expr.register_plugin( - lib=lib, - symbol="to_julian_date", - is_elementwise=True, - args=[], - ) - -def ceil( - expr: str | pl.Expr, - every: str | pl.Expr, -) -> pl.Expr: - """Find "ceiling" of datetime. 
- - Parameters - ---------- - every - Duration string, created with the - the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - These strings can be combined: - - - 3d12h4m25s # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - Returns - ------- - Expr - Expression of data type :class:`Utf8`. - - Examples - -------- - >>> from datetime import datetime - >>> import polars_xdt # noqa: F401 - >>> df = pl.DataFrame( - ... { - ... "date_col": [datetime(2024, 8, 24), datetime(2024, 10, 1)], - ... } - ... ) - >>> df.with_columns(result=pl.col("date_col").xdt.ceil("1mo")) - shape: (2, 2) - ┌─────────────────────┬─────────────────────┐ - │ date_col ┆ result │ - │ --- ┆ --- │ - │ datetime[μs] ┆ datetime[μs] │ - ╞═════════════════════╪═════════════════════╡ - │ 2024-08-24 00:00:00 ┆ 2024-09-01 00:00:00 │ - │ 2024-10-01 00:00:00 ┆ 2024-10-01 00:00:00 │ - └─────────────────────┴─────────────────────┘ - """ - if isinstance(expr, str): - expr = pl.col(expr) - truncated = expr.dt.truncate(every) - return ( - pl.when(expr == truncated) - .then(expr) - .otherwise(truncated.dt.offset_by(every)) - ) - -def day_name(expr: str | pl.Expr, locale: str | None = None) -> pl.Expr: - """ - Return day name, in specified locale (if specified). - - Returns - ------- - Expr - Expression of data type :class:`Utf8`. - - See Also - -------- - format_localized : format according to locale. - - Examples - -------- - >>> from datetime import datetime - >>> import polars_xdt # noqa: F401 - >>> df = pl.DataFrame( - ... { - ... 
"ts": [datetime(2020, 10, 25), datetime(2020, 10, 26)], - ... } - ... ) - >>> df.with_columns( - ... english_day_name=pl.col("ts").xdt.day_name(), - ... french_day_name=pl.col("ts").xdt.day_name("fr_FR"), - ... ukrainian_day_name=pl.col("ts").xdt.day_name("uk_UA"), - ... ) - shape: (2, 4) - ┌─────────────────────┬──────────────────┬─────────────────┬────────────────────┐ - │ ts ┆ english_day_name ┆ french_day_name ┆ ukrainian_day_name │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ str ┆ str ┆ str │ - ╞═════════════════════╪══════════════════╪═════════════════╪════════════════════╡ - │ 2020-10-25 00:00:00 ┆ Sunday ┆ dimanche ┆ неділя │ - │ 2020-10-26 00:00:00 ┆ Monday ┆ lundi ┆ понеділок │ - └─────────────────────┴──────────────────┴─────────────────┴────────────────────┘ - """ - if isinstance(expr, str): - expr = pl.col(expr) - if locale is None: - result = expr.dt.to_string("%A") - else: - result = format_localized(expr, "%A", locale=locale) # type: ignore[attr-defined] - return result - -def month_name(expr: str | pl.Expr, locale: str | None = None) -> pl.Expr: - """ - Return month name, in specified locale (if specified). - - Returns - ------- - Expr - Expression of data type :class:`Utf8`. - - See Also - -------- - format_localized : format according to locale. - - Examples - -------- - >>> from datetime import datetime - >>> import polars_xdt # noqa: F401 - >>> df = pl.DataFrame( - ... { - ... "ts": [datetime(2020, 10, 25), datetime(2020, 11, 26)], - ... } - ... ) - >>> df.with_columns( - ... english_month_name=pl.col("ts").xdt.month_name(), - ... french_month_name=pl.col("ts").xdt.month_name("fr_FR"), - ... ukrainian_month_name=pl.col("ts").xdt.month_name("uk_UA"), - ... 
) - shape: (2, 4) - ┌─────────────────────┬────────────────────┬───────────────────┬──────────────────────┐ - │ ts ┆ english_month_name ┆ french_month_name ┆ ukrainian_month_name │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ str ┆ str ┆ str │ - ╞═════════════════════╪════════════════════╪═══════════════════╪══════════════════════╡ - │ 2020-10-25 00:00:00 ┆ October ┆ octobre ┆ жовтня │ - │ 2020-11-26 00:00:00 ┆ November ┆ novembre ┆ листопада │ - └─────────────────────┴────────────────────┴───────────────────┴──────────────────────┘ - """ - if isinstance(expr, str): - expr = pl.col(expr) - if locale is None: - result = expr.dt.to_string("%B") - else: - result = format_localized(expr, "%B", locale=locale) - return result - - -class XDTExpr(pl.Expr): - @property - def xdt(self) -> ExprXDTNamespace: - return ExprXDTNamespace(self) - - -class XDTColumn(Protocol): - def __call__( - self, - name: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], - *more_names: str | PolarsDataType, - ) -> XDTExpr: - ... - - def __getattr__(self, name: str) -> pl.Expr: - ... - - @property - def xdt(self) -> ExprXDTNamespace: - ... - - -col = cast(XDTColumn, pl.col) - - -def workday_count( - start: str | pl.Expr | date, - end: str | pl.Expr | date, - weekend: Sequence[str] = ("Sat", "Sun"), - holidays: Sequence[date] | None = None, -) -> pl.Expr: - """Count the number of workdays between two columns of dates. - - Parameters - ---------- - start - Start date(s). This can be a string column, a date column, or a single date. - end - End date(s). This can be a string column, a date column, or a single date. - weekend - The days of the week that are considered weekends. Defaults to ("Sat", "Sun"). - holidays - The holidays to exclude from the calculation. Defaults to None. This should - be a list of ``datetime.date`` s. 
- - Returns - ------- - polars.Expr - - Examples - -------- - >>> from datetime import date - >>> import polars as pl - >>> import polars_xdt # noqa: F401 - >>> df = pl.DataFrame( - ... { - ... "start": [date(2023, 1, 4), date(2023, 5, 1), date(2023, 9, 9)], - ... "end": [date(2023, 2, 8), date(2023, 5, 2), date(2023, 12, 30)], - ... } - ... ) - >>> df.with_columns( - ... n_business_days=polars_xdt.workday_count("start", "end") - ... ) - shape: (3, 3) - ┌────────────┬────────────┬─────────────────┐ - │ start ┆ end ┆ n_business_days │ - │ --- ┆ --- ┆ --- │ - │ date ┆ date ┆ i32 │ - ╞════════════╪════════════╪═════════════════╡ - │ 2023-01-04 ┆ 2023-02-08 ┆ 25 │ - │ 2023-05-01 ┆ 2023-05-02 ┆ 1 │ - │ 2023-09-09 ┆ 2023-12-30 ┆ 80 │ - └────────────┴────────────┴─────────────────┘ - """ - if isinstance(start, str): - start = col(start) - elif not isinstance(start, pl.Expr): - start = pl.lit(start) - if isinstance(end, str): - end = col(end) - elif not isinstance(end, pl.Expr): - end = pl.lit(end) - - weekmask = get_weekmask(weekend) - if not holidays: - holidays_int = [] - else: - holidays_int = sorted( - {(holiday - date(1970, 1, 1)).days for holiday in holidays}, - ) - return start.register_plugin( - lib=lib, - symbol="sub", - is_elementwise=True, - args=[end], - kwargs={ - "weekmask": weekmask, - "holidays": holidays_int, - }, - ) - - __all__ = [ - "col", + "ceil", + "day_name", "date_range", + "format_localized", + "from_local_datetime", + "is_workday", + "month_name", + "offset_by", + "to_julian_date", + "to_local_datetime", "workday_count", "__version__", ] diff --git a/polars_xdt/functions.py b/polars_xdt/functions.py new file mode 100644 index 0000000..cb00ec4 --- /dev/null +++ b/polars_xdt/functions.py @@ -0,0 +1,703 @@ +from __future__ import annotations + +import re +import sys +from datetime import date +from typing import TYPE_CHECKING, Literal, Sequence + +import polars as pl +from polars.utils._parse_expr_input import parse_as_expression +from 
polars.utils._wrap import wrap_expr +from polars.utils.udfs import _get_shared_lib_location + +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + +if TYPE_CHECKING: + from polars.type_aliases import IntoExpr + +RollStrategy: TypeAlias = Literal["raise", "forward", "backward"] + + +lib = _get_shared_lib_location(__file__) + +mapping = {"Mon": 1, "Tue": 2, "Wed": 3, "Thu": 4, "Fri": 5, "Sat": 6, "Sun": 7} +reverse_mapping = {value: key for key, value in mapping.items()} + +if TYPE_CHECKING: + from polars import Expr + from polars.type_aliases import Ambiguous + + +def get_weekmask(weekend: Sequence[str]) -> list[bool]: + if weekend == ("Sat", "Sun"): + weekmask = [True, True, True, True, True, False, False] + else: + weekmask = [reverse_mapping[i] not in weekend for i in range(1, 8)] + if sum(weekmask) == 0: + msg = f"At least one day of the week must be a business day. Got weekend={weekend}" + raise ValueError(msg) + return weekmask + + +def offset_by( + expr: IntoExpr, + by: IntoExpr, + *, + weekend: Sequence[str] = ("Sat", "Sun"), + holidays: Sequence[date] | None = None, + roll: RollStrategy = "raise", +) -> pl.Expr: + """Offset this date by a relative time offset. + + Parameters + ---------- + by + The offset to apply. This can be a string of the form "nbd" (where n + is an integer), or a polars expression that evaluates to such a string. + Additional units are passed to `polars.dt.offset_by`. + weekend + The days of the week that are considered weekends. Defaults to ("Sat", "Sun"). + holidays + The holidays to exclude from the calculation. Defaults to None. + roll + How to handle dates that fall on a non-workday. + + - "raise" raise an error (default). + - "forward" roll forward to the next business day. + - "backward" roll backward to the previous business day. 
+ + Returns + ------- + polars.Expr + + Examples + -------- + >>> import polars as pl + >>> import polars_xdt as xdt + >>> df = pl.DataFrame( + ... {"date": [date(2023, 4, 3), date(2023, 9, 1), date(2024, 1, 4)]} + ... ) + >>> df.with_columns( + ... date_shifted=xdt.offset_by("date", "1bd"), + ... ) + shape: (3, 2) + ┌────────────┬──────────────┐ + │ date ┆ date_shifted │ + │ --- ┆ --- │ + │ date ┆ date │ + ╞════════════╪══════════════╡ + │ 2023-04-03 ┆ 2023-04-04 │ + │ 2023-09-01 ┆ 2023-09-04 │ + │ 2024-01-04 ┆ 2024-01-05 │ + └────────────┴──────────────┘ + + You can also specify custom weekends and holidays: + + >>> import holidays + >>> holidays_england = holidays.country_holidays( + ... "UK", subdiv="ENG", years=[2023, 2024] + ... ) + >>> df.with_columns( + ... date_shifted=xdt.offset_by( + ... "date", + ... "5bd", + ... holidays=holidays_england, + ... weekend=["Fri", "Sat"], + ... roll="backward", + ... ), + ... ) + shape: (3, 2) + ┌────────────┬──────────────┐ + │ date ┆ date_shifted │ + │ --- ┆ --- │ + │ date ┆ date │ + ╞════════════╪══════════════╡ + │ 2023-04-03 ┆ 2023-04-11 │ + │ 2023-09-01 ┆ 2023-09-07 │ + │ 2024-01-04 ┆ 2024-01-11 │ + └────────────┴──────────────┘ + + You can also pass expressions to `by`: + + >>> df = pl.DataFrame( + ... { + ... "date": [ + ... date(2023, 4, 3), + ... date(2023, 9, 1), + ... date(2024, 1, 4), + ... ], + ... "by": ["1bd", "2bd", "-3bd"], + ... } + ... 
) + >>> df.with_columns(date_shifted=xdt.offset_by("date", pl.col("by"))) + shape: (3, 3) + ┌────────────┬──────┬──────────────┐ + │ date ┆ by ┆ date_shifted │ + │ --- ┆ --- ┆ --- │ + │ date ┆ str ┆ date │ + ╞════════════╪══════╪══════════════╡ + │ 2023-04-03 ┆ 1bd ┆ 2023-04-04 │ + │ 2023-09-01 ┆ 2bd ┆ 2023-09-05 │ + │ 2024-01-04 ┆ -3bd ┆ 2024-01-01 │ + └────────────┴──────┴──────────────┘ + """ + expr = wrap_expr(parse_as_expression(expr)) + if ( + isinstance(by, str) + and (match := re.search(r"(\d+bd)", by)) is not None + and (len(match.group(1)) == len(by)) + ): + # Fast path - do we have a business day offset, and nothing else? + n: int | pl.Expr = int(by[:-2]) + fastpath = True + else: + if not isinstance(by, pl.Expr): + by = pl.lit(by) + n = (by.str.extract(r"^(-?)") + by.str.extract(r"(\d+)bd")).cast( + pl.Int32, + ) + by = by.str.replace(r"(\d+bd)", "") + fastpath = False + + if not holidays: + holidays_int = [] + else: + holidays_int = sorted( + {(holiday - date(1970, 1, 1)).days for holiday in holidays}, + ) + weekmask = get_weekmask(weekend) + + result = expr.register_plugin( + lib=lib, + symbol="advance_n_days", + is_elementwise=True, + args=[n], + kwargs={ + "holidays": holidays_int, + "weekmask": weekmask, + "roll": roll, + }, + ) + if fastpath: + return result + return result.dt.offset_by(by) + + +def is_workday( + expr: IntoExpr, + *, + weekend: Sequence[str] = ("Sat", "Sun"), + holidays: Sequence[date] | None = None, +) -> pl.Expr: + """Determine whether a day is a workday. + + Parameters + ---------- + weekend + The days of the week that are considered weekends. Defaults to ("Sat", "Sun"). + holidays + The holidays to exclude from the calculation. Defaults to None. This should + be a list of ``datetime.date`` s. + + Returns + ------- + polars.Expr + + Examples + -------- + >>> from datetime import date + >>> import polars as pl + >>> import polars_xdt as xdt + >>> df = pl.DataFrame( + ... { + ... "date": [ + ... date(2023, 1, 4), + ... 
date(2023, 5, 1), + ... date(2023, 9, 9), + ... ], + ... } + ... ) + >>> df.with_columns(is_workday=xdt.is_workday("date")) + shape: (3, 2) + ┌────────────┬────────────┐ + │ date ┆ is_workday │ + │ --- ┆ --- │ + │ date ┆ bool │ + ╞════════════╪════════════╡ + │ 2023-01-04 ┆ true │ + │ 2023-05-01 ┆ true │ + │ 2023-09-09 ┆ false │ + └────────────┴────────────┘ + """ + expr = wrap_expr(parse_as_expression(expr)) + weekmask = get_weekmask(weekend) + if not holidays: + holidays_int = [] + else: + holidays_int = sorted( + {(holiday - date(1970, 1, 1)).days for holiday in holidays}, + ) + return expr.register_plugin( + lib=lib, + symbol="is_workday", + is_elementwise=True, + args=[], + kwargs={ + "weekmask": weekmask, + "holidays": holidays_int, + }, + ) + + +def from_local_datetime( + expr: IntoExpr, + from_tz: str | Expr, + to_tz: str, + ambiguous: Ambiguous = "raise", +) -> pl.Expr: + """Converts from local datetime in given time zone to new timezone. + + Parameters + ---------- + from_tz + Current timezone of each datetime + to_tz + Timezone to convert to + ambiguous + Determine how to deal with ambiguous datetimes: + + - `'raise'` (default): raise + - `'earliest'`: use the earliest datetime + - `'latest'`: use the latest datetime + + Returns + ------- + Expr + Expression of data type :class:`DateTime`. + + Examples + -------- + You can go from a localized datetime back to expressing the datetimes + in a single timezone with `from_local_datetime`. + + >>> from datetime import datetime + >>> import polars_xdt as xdt + >>> df = pl.DataFrame( + ... { + ... "local_dt": [ + ... datetime(2020, 10, 10, 1), + ... datetime(2020, 10, 10, 2), + ... datetime(2020, 10, 9, 20), + ... ], + ... "timezone": [ + ... "Europe/London", + ... "Africa/Kigali", + ... "America/New_York", + ... ], + ... } + ... ) + >>> df.with_columns( + ... xdt.from_local_datetime( + ... "local_dt", pl.col("timezone"), "UTC" + ... ).alias("date") + ... 
) + shape: (3, 3) + ┌─────────────────────┬──────────────────┬─────────────────────────┐ + │ local_dt ┆ timezone ┆ date │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ datetime[μs, UTC] │ + ╞═════════════════════╪══════════════════╪═════════════════════════╡ + │ 2020-10-10 01:00:00 ┆ Europe/London ┆ 2020-10-10 00:00:00 UTC │ + │ 2020-10-10 02:00:00 ┆ Africa/Kigali ┆ 2020-10-10 00:00:00 UTC │ + │ 2020-10-09 20:00:00 ┆ America/New_York ┆ 2020-10-10 00:00:00 UTC │ + └─────────────────────┴──────────────────┴─────────────────────────┘ + """ + expr = wrap_expr(parse_as_expression(expr)) + from_tz = wrap_expr(parse_as_expression(from_tz, str_as_lit=True)) + return expr.register_plugin( + lib=lib, + symbol="from_local_datetime", + is_elementwise=True, + args=[from_tz], + kwargs={ + "to_tz": to_tz, + "ambiguous": ambiguous, + }, + ) + + +def to_local_datetime( + expr: IntoExpr, + time_zone: str | Expr, +) -> pl.Expr: + """Convert to local datetime in given time zone. + + Parameters + ---------- + time_zone + Time zone to convert to. + + Returns + ------- + Expr + Expression of data type :class:`DateTime`. + + Examples + -------- + You can use `to_local_datetime` to figure out how a tz-aware datetime + will be expressed as a local datetime. + + >>> from datetime import datetime + >>> import polars_xdt as xdt + >>> df = pl.DataFrame( + ... { + ... "date_col": [datetime(2020, 10, 10)] * 3, + ... "timezone": [ + ... "Europe/London", + ... "Africa/Kigali", + ... "America/New_York", + ... ], + ... } + ... ).with_columns(pl.col("date_col").dt.replace_time_zone("UTC")) + >>> df.with_columns( + ... xdt.to_local_datetime("date_col", pl.col("timezone")).alias( + ... "local_dt" + ... ) + ... 
) + shape: (3, 3) + ┌─────────────────────────┬──────────────────┬─────────────────────┐ + │ date_col ┆ timezone ┆ local_dt │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs, UTC] ┆ str ┆ datetime[μs] │ + ╞═════════════════════════╪══════════════════╪═════════════════════╡ + │ 2020-10-10 00:00:00 UTC ┆ Europe/London ┆ 2020-10-10 01:00:00 │ + │ 2020-10-10 00:00:00 UTC ┆ Africa/Kigali ┆ 2020-10-10 02:00:00 │ + │ 2020-10-10 00:00:00 UTC ┆ America/New_York ┆ 2020-10-09 20:00:00 │ + └─────────────────────────┴──────────────────┴─────────────────────┘ + """ + expr = wrap_expr(parse_as_expression(expr)) + time_zone = wrap_expr(parse_as_expression(time_zone, str_as_lit=True)) + return expr.register_plugin( + lib=lib, + symbol="to_local_datetime", + is_elementwise=True, + args=[time_zone], + ) + + +def format_localized( + expr: IntoExpr, + format: str, # noqa: A002 + locale: str = "uk_UA", +) -> pl.Expr: + """Convert to local datetime in given time zone. + + Parameters + ---------- + format + Format string, see https://docs.rs/chrono/latest/chrono/format/strftime/index.html + for what's available. + locale + Locale to use for formatting. Defaults to "uk_UA", because that's what the OP + requested https://github.com/pola-rs/polars/issues/12341. + + Returns + ------- + Expr + Expression of data type :class:`Utf8`. + + Examples + -------- + >>> from datetime import datetime + >>> import polars_xdt as xdt + >>> df = pl.DataFrame( + ... { + ... "date_col": [datetime(2024, 8, 24), datetime(2024, 10, 1)], + ... } + ... ) + >>> df.with_columns( + ... result=xdt.format_localized( + ... "date_col", format="%A, %d %B %Y", locale="uk_UA" + ... ) + ... 
) + shape: (2, 2) + ┌─────────────────────┬──────────────────────────┐ + │ date_col ┆ result │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪══════════════════════════╡ + │ 2024-08-24 00:00:00 ┆ субота, 24 серпня 2024 │ + │ 2024-10-01 00:00:00 ┆ вівторок, 01 жовтня 2024 │ + └─────────────────────┴──────────────────────────┘ + """ + expr = wrap_expr(parse_as_expression(expr)) + return expr.register_plugin( + lib=lib, + symbol="format_localized", + is_elementwise=True, + args=[], + kwargs={"format": format, "locale": locale}, + ) + + +def to_julian_date(expr: str | pl.Expr) -> pl.Expr: + """Return the Julian date corresponding to given datetimes. + + Examples + -------- + >>> from datetime import datetime + >>> import polars_xdt as xdt + >>> df = pl.DataFrame( + ... { + ... "date_col": [ + ... datetime(2013, 1, 1, 0, 30), + ... datetime(2024, 1, 7, 13, 18, 51), + ... ], + ... } + ... ) + >>> with pl.Config(float_precision=10) as cfg: + ... df.with_columns(julian_date=xdt.to_julian_date("date_col")) + shape: (2, 2) + ┌─────────────────────┬────────────────────┐ + │ date_col ┆ julian_date │ + │ --- ┆ --- │ + │ datetime[μs] ┆ f64 │ + ╞═════════════════════╪════════════════════╡ + │ 2013-01-01 00:30:00 ┆ 2456293.5208333335 │ + │ 2024-01-07 13:18:51 ┆ 2460317.0547569445 │ + └─────────────────────┴────────────────────┘ + """ + expr = wrap_expr(parse_as_expression(expr)) + return expr.register_plugin( + lib=lib, + symbol="to_julian_date", + is_elementwise=True, + args=[], + ) + + +def ceil( + expr: IntoExpr, + every: str | pl.Expr, +) -> pl.Expr: + """Find "ceiling" of datetime. 
+ + Parameters + ---------- + every + Duration string, created with the + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + + These strings can be combined: + + - 3d12h4m25s # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Returns + ------- + Expr + Expression of data type :class:`Utf8`. + + Examples + -------- + >>> from datetime import datetime + >>> import polars_xdt as xdt + >>> df = pl.DataFrame( + ... { + ... "date_col": [datetime(2024, 8, 24), datetime(2024, 10, 1)], + ... } + ... ) + >>> df.with_columns(result=xdt.ceil("date_col", "1mo")) + shape: (2, 2) + ┌─────────────────────┬─────────────────────┐ + │ date_col ┆ result │ + │ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╡ + │ 2024-08-24 00:00:00 ┆ 2024-09-01 00:00:00 │ + │ 2024-10-01 00:00:00 ┆ 2024-10-01 00:00:00 │ + └─────────────────────┴─────────────────────┘ + """ + expr = wrap_expr(parse_as_expression(expr)) + truncated = expr.dt.truncate(every) + return ( + pl.when(expr == truncated) + .then(expr) + .otherwise(truncated.dt.offset_by(every)) + ) + + +def day_name(expr: str | pl.Expr, locale: str | None = None) -> pl.Expr: + """ + Return day name, in specified locale (if specified). + + Returns + ------- + Expr + Expression of data type :class:`Utf8`. + + See Also + -------- + format_localized : format according to locale. + + Examples + -------- + >>> from datetime import datetime + >>> import polars_xdt as xdt + >>> df = pl.DataFrame( + ... { + ... 
"ts": [datetime(2020, 10, 25), datetime(2020, 10, 26)], + ... } + ... ) + >>> df.with_columns( + ... english_day_name=xdt.day_name("ts"), + ... french_day_name=xdt.day_name("ts", locale="fr_FR"), + ... ukrainian_day_name=xdt.day_name("ts", locale="uk_UA"), + ... ) + shape: (2, 4) + ┌─────────────────────┬──────────────────┬─────────────────┬────────────────────┐ + │ ts ┆ english_day_name ┆ french_day_name ┆ ukrainian_day_name │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ str ┆ str │ + ╞═════════════════════╪══════════════════╪═════════════════╪════════════════════╡ + │ 2020-10-25 00:00:00 ┆ Sunday ┆ dimanche ┆ неділя │ + │ 2020-10-26 00:00:00 ┆ Monday ┆ lundi ┆ понеділок │ + └─────────────────────┴──────────────────┴─────────────────┴────────────────────┘ + """ + expr = wrap_expr(parse_as_expression(expr)) + if locale is None: + result = expr.dt.to_string("%A") + else: + result = format_localized(expr, "%A", locale=locale) # type: ignore[attr-defined] + return result + + +def month_name(expr: str | pl.Expr, locale: str | None = None) -> pl.Expr: + """ + Return month name, in specified locale (if specified). + + Returns + ------- + Expr + Expression of data type :class:`Utf8`. + + See Also + -------- + format_localized : format according to locale. + + Examples + -------- + >>> from datetime import datetime + >>> import polars_xdt as xdt + >>> df = pl.DataFrame( + ... { + ... "ts": [datetime(2020, 10, 25), datetime(2020, 11, 26)], + ... } + ... ) + >>> df.with_columns( + ... english_month_name=xdt.month_name("ts"), + ... french_month_name=xdt.month_name("ts", locale="fr_FR"), + ... ukrainian_month_name=xdt.month_name("ts", locale="uk_UA"), + ... 
) + shape: (2, 4) + ┌─────────────────────┬────────────────────┬───────────────────┬──────────────────────┐ + │ ts ┆ english_month_name ┆ french_month_name ┆ ukrainian_month_name │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ str ┆ str │ + ╞═════════════════════╪════════════════════╪═══════════════════╪══════════════════════╡ + │ 2020-10-25 00:00:00 ┆ October ┆ octobre ┆ жовтня │ + │ 2020-11-26 00:00:00 ┆ November ┆ novembre ┆ листопада │ + └─────────────────────┴────────────────────┴───────────────────┴──────────────────────┘ + """ + expr = wrap_expr(parse_as_expression(expr)) + if locale is None: + result = expr.dt.to_string("%B") + else: + result = format_localized(expr, "%B", locale=locale) + return result + + +def workday_count( + start_dates: IntoExpr, + end_dates: IntoExpr, + weekend: Sequence[str] = ("Sat", "Sun"), + holidays: Sequence[date] | None = None, +) -> pl.Expr: + """Count the number of workdays between two columns of dates. + + Parameters + ---------- + start + Start date(s). This can be a string column, a date column, or a single date. + end + End date(s). This can be a string column, a date column, or a single date. + weekend + The days of the week that are considered weekends. Defaults to ("Sat", "Sun"). + holidays + The holidays to exclude from the calculation. Defaults to None. This should + be a list of ``datetime.date`` s. + + Returns + ------- + polars.Expr + + Examples + -------- + >>> from datetime import date + >>> import polars as pl + >>> import polars_xdt as xdt + >>> df = pl.DataFrame( + ... { + ... "start": [date(2023, 1, 4), date(2023, 5, 1), date(2023, 9, 9)], + ... "end": [date(2023, 2, 8), date(2023, 5, 2), date(2023, 12, 30)], + ... } + ... 
) + >>> df.with_columns(n_business_days=xdt.workday_count("start", "end")) + shape: (3, 3) + ┌────────────┬────────────┬─────────────────┐ + │ start ┆ end ┆ n_business_days │ + │ --- ┆ --- ┆ --- │ + │ date ┆ date ┆ i32 │ + ╞════════════╪════════════╪═════════════════╡ + │ 2023-01-04 ┆ 2023-02-08 ┆ 25 │ + │ 2023-05-01 ┆ 2023-05-02 ┆ 1 │ + │ 2023-09-09 ┆ 2023-12-30 ┆ 80 │ + └────────────┴────────────┴─────────────────┘ + """ + start_dates = wrap_expr(parse_as_expression(start_dates)) + end_dates = wrap_expr(parse_as_expression(end_dates)) + weekmask = get_weekmask(weekend) + if not holidays: + holidays_int = [] + else: + holidays_int = sorted( + {(holiday - date(1970, 1, 1)).days for holiday in holidays}, + ) + return start_dates.register_plugin( + lib=lib, + symbol="workday_count", + is_elementwise=True, + args=[end_dates], + kwargs={ + "weekmask": weekmask, + "holidays": holidays_int, + }, + ) diff --git a/polars_xdt/namespace.py b/polars_xdt/namespace.py new file mode 100644 index 0000000..4ec3e89 --- /dev/null +++ b/polars_xdt/namespace.py @@ -0,0 +1,23 @@ +from __future__ import annotations + +from typing import Any, Callable + +import polars as pl + +from polars_xdt import functions + + +@pl.api.register_expr_namespace("xdt") +class ExprXDTNamespace: + """eXtra stuff for DateTimes.""" + + def __init__(self, expr: pl.Expr) -> None: + self._expr = expr + + def __getattr__(self, function_name: str) -> Callable[[Any], pl.Expr]: + def func(*args: Any, **kwargs: Any) -> pl.Expr: + return getattr(functions, function_name)( + self._expr, *args, **kwargs + ) + + return func diff --git a/src/expressions.rs b/src/expressions.rs index 026dcf8..28df789 100644 --- a/src/expressions.rs +++ b/src/expressions.rs @@ -74,12 +74,12 @@ fn advance_n_days(inputs: &[Series], kwargs: BusinessDayKwargs) -> PolarsResult< } #[polars_expr(output_type=Int32)] -fn sub(inputs: &[Series], kwargs: BusinessDayKwargs) -> PolarsResult { +fn workday_count(inputs: &[Series], kwargs: 
BusinessDayKwargs) -> PolarsResult { let begin_dates = &inputs[0]; let end_dates = &inputs[1]; let weekmask = kwargs.weekmask; let holidays = kwargs.holidays; - impl_sub(begin_dates, end_dates, &weekmask, holidays) + impl_workday_count(begin_dates, end_dates, &weekmask, holidays) } #[polars_expr(output_type=Boolean)] diff --git a/src/sub.rs b/src/sub.rs index eefc654..c9ced02 100644 --- a/src/sub.rs +++ b/src/sub.rs @@ -48,7 +48,7 @@ fn date_diff( } } -pub(crate) fn impl_sub( +pub(crate) fn impl_workday_count( start_dates: &Series, end_dates: &Series, weekmask: &[bool; 7], diff --git a/tests/ceil_test.py b/tests/ceil_test.py index e9d552e..4aecbe4 100644 --- a/tests/ceil_test.py +++ b/tests/ceil_test.py @@ -2,15 +2,17 @@ import polars as pl import polars_xdt as xdt + def test_ceil() -> None: - df = pl.DataFrame({ - "date_col": [ - datetime(2024, 8, 24, 1, 2, 3, 123456), - datetime(2024, 10, 1), - ], + df = pl.DataFrame( + { + "date_col": [ + datetime(2024, 8, 24, 1, 2, 3, 123456), + datetime(2024, 10, 1), + ], }, schema={"date_col": pl.Datetime("ms")}, ) - result = df.select(result=xdt.ceil('date_col', '1mo'))['result'] + result = df.select(result=xdt.ceil("date_col", "1mo"))["result"] assert result[0] == datetime(2024, 9, 1, 0, 0, 0, 0) assert result[1] == datetime(2024, 10, 1, 0, 0, 0, 0) diff --git a/tests/julian_date_test.py b/tests/julian_date_test.py index 85c7eae..67ede9a 100644 --- a/tests/julian_date_test.py +++ b/tests/julian_date_test.py @@ -9,26 +9,27 @@ import polars_xdt as xdt - - @given( - date=st.datetimes(min_value=dt.datetime(1, 1, 1), max_value=dt.datetime(9999, 12, 31)), + date=st.datetimes( + min_value=dt.datetime(1, 1, 1), max_value=dt.datetime(9999, 12, 31) + ), ) def test_against_pandas( date: dt.date, ) -> None: - df = pl.DataFrame({'a': [date]}, schema={'a': pl.Datetime('ms')}) - result = df.select(xdt.to_julian_date('a'))['a'].item() - expected = pd.Timestamp(df['a'].item()).to_julian_date() + df = pl.DataFrame({"a": [date]}, schema={"a": 
pl.Datetime("ms")}) + result = df.select(xdt.to_julian_date("a"))["a"].item() + expected = pd.Timestamp(df["a"].item()).to_julian_date() assert result == expected + @given( date=st.dates(min_value=dt.date(1, 1, 1), max_value=dt.date(9999, 12, 31)), ) def test_against_pandas_date( date: dt.date, ) -> None: - df = pl.DataFrame({'a': [date]}) - result = df.select(xdt.to_julian_date('a'))['a'].item() - expected = pd.Timestamp(df['a'].item()).to_julian_date() + df = pl.DataFrame({"a": [date]}) + result = df.select(xdt.to_julian_date("a"))["a"].item() + expected = pd.Timestamp(df["a"].item()).to_julian_date() assert result == expected diff --git a/tests/test_business_offsets.py b/tests/test_business_offsets.py index 32a4a6c..e504329 100644 --- a/tests/test_business_offsets.py +++ b/tests/test_business_offsets.py @@ -22,24 +22,28 @@ def get_result( date: dt.date, dtype: PolarsDataType, by: str | pl.Series, - **kwargs: Mapping[str, Any], + **kwargs: Any, ) -> dt.date: if dtype == pl.Date: result = ( pl.DataFrame({"ts": [date]}) - .select(xdt.offset_by('ts', by=by, **kwargs))["ts"] # type: ignore[arg-type] + .select(xdt.offset_by("ts", by=by, **kwargs))["ts"] # type: ignore[arg-type] .item() ) else: try: result = ( - pl.DataFrame({"ts": [dt.datetime(date.year, date.month, date.day)]}) + pl.DataFrame( + {"ts": [dt.datetime(date.year, date.month, date.day)]} + ) .select( - xdt.offset_by(pl.col("ts") - .dt.cast_time_unit(dtype.time_unit) # type: ignore[union-attr] - .dt.replace_time_zone(dtype.time_zone) # type: ignore[union-attr] - , by=by, **kwargs) - .dt.date() + xdt.offset_by( + pl.col("ts") + .dt.cast_time_unit(dtype.time_unit) # type: ignore[union-attr] + .dt.replace_time_zone(dtype.time_zone), # type: ignore[union-attr] + by=by, + **kwargs, # type: ignore[arg-type] + ).dt.date() )["ts"] .item() ) @@ -50,7 +54,9 @@ def get_result( @given( - date=st.dates(min_value=dt.date(1969, 1, 1), max_value=dt.date(1971, 12, 31)), + date=st.dates( + min_value=dt.date(1969, 1, 1), 
max_value=dt.date(1971, 12, 31) + ), n=st.integers(min_value=-30, max_value=30), weekend=st.lists( st.sampled_from(["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]), @@ -59,7 +65,9 @@ def get_result( unique=True, ), holidays=st.lists( - st.dates(min_value=dt.date(1969, 1, 1), max_value=dt.date(1971, 12, 31)), + st.dates( + min_value=dt.date(1969, 1, 1), max_value=dt.date(1971, 12, 31) + ), min_size=1, max_size=300, ), @@ -85,7 +93,12 @@ def test_against_np_busday_offset( assume(date not in holidays) roll = "raise" result = get_result( - date, dtype, by=function(f"{n}bd"), weekend=weekend, holidays=holidays, roll=roll # type: ignore[arg-type] + date, + dtype, + by=function(f"{n}bd"), + weekend=weekend, + holidays=holidays, + roll=roll, ) weekmask = [0 if reverse_mapping[i] in weekend else 1 for i in range(1, 8)] expected = np.busday_offset(date, n, weekmask=weekmask, holidays=holidays) @@ -93,7 +106,9 @@ def test_against_np_busday_offset( @given( - date=st.dates(min_value=dt.date(1969, 1, 1), max_value=dt.date(1971, 12, 31)), + date=st.dates( + min_value=dt.date(1969, 1, 1), max_value=dt.date(1971, 12, 31) + ), n=st.integers(min_value=-30, max_value=30), weekend=st.lists( st.sampled_from(["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]), @@ -102,7 +117,9 @@ def test_against_np_busday_offset( unique=True, ), holidays=st.lists( - st.dates(min_value=dt.date(1969, 1, 1), max_value=dt.date(1971, 12, 31)), + st.dates( + min_value=dt.date(1969, 1, 1), max_value=dt.date(1971, 12, 31) + ), min_size=1, max_size=300, ), @@ -127,7 +144,12 @@ def test_against_np_busday_offset_with_roll( roll: Literal["forward", "backward"], ) -> None: result = get_result( - date, dtype, by=function(f"{n}bd"), weekend=weekend, holidays=holidays, roll=roll # type: ignore[arg-type] + date, + dtype, + by=function(f"{n}bd"), + weekend=weekend, + holidays=holidays, + roll=roll, # type: ignore[arg-type] ) weekmask = [0 if reverse_mapping[i] in weekend else 1 for i in range(1, 8)] expected = 
np.busday_offset( @@ -156,7 +178,7 @@ def test_extra_args(by: str, expected: dt.datetime) -> None: df = pl.DataFrame({"dates": [start]}) result = ( df.with_columns( - dates_shifted=xdt.offset_by('dates', by=by) + dates_shifted=xdt.offset_by("dates", by=by) ).with_columns(end_wday=pl.col("dates_shifted").dt.strftime("%a")) )["dates_shifted"].item() assert result == expected @@ -167,7 +189,7 @@ def test_extra_args_w_series() -> None: df = pl.DataFrame({"dates": [start] * 2, "by": ["1bd2h", "-1bd1h"]}) result = ( df.with_columns( - dates_shifted=xdt.offset_by('dates', by=pl.col("by")) + dates_shifted=xdt.offset_by("dates", by=pl.col("by")) ).with_columns(end_wday=pl.col("dates_shifted").dt.strftime("%a")) )["dates_shifted"] assert result[0] == dt.datetime(2000, 1, 4, 2) @@ -181,7 +203,8 @@ def test_starting_on_non_business() -> None: df = pl.DataFrame({"dates": [start]}) with pytest.raises(pl.ComputeError): df.with_columns( - dates_shifted=xdt.offset_by('dates', + dates_shifted=xdt.offset_by( + "dates", by=f"{n}bd", weekend=weekend, ) @@ -192,7 +215,8 @@ def test_starting_on_non_business() -> None: holidays = [start] with pytest.raises(pl.ComputeError): df.with_columns( - dates_shifted=xdt.offset_by('dates', + dates_shifted=xdt.offset_by( + "dates", by=f"{n}bd", holidays=holidays, weekend=weekend, @@ -201,7 +225,10 @@ def test_starting_on_non_business() -> None: def test_within_group_by() -> None: - data = {"a": [1, 2], "date": [dt.datetime(2022, 2, 1), dt.datetime(2023, 2, 1)]} + data = { + "a": [1, 2], + "date": [dt.datetime(2022, 2, 1), dt.datetime(2023, 2, 1)], + } df = pl.DataFrame(data) result = ( @@ -222,7 +249,11 @@ def test_within_group_by() -> None: def test_invalid_roll_strategy() -> None: df = pl.DataFrame( - {"date": pl.date_range(dt.date(2023, 12, 1), dt.date(2023, 12, 5), eager=True)} + { + "date": pl.date_range( + dt.date(2023, 12, 1), dt.date(2023, 12, 5), eager=True + ) + } ) with pytest.raises(pl.ComputeError): - 
df.with_columns(xdt.offset_by('date', "1bd", roll="cabbage")) # type: ignore[arg-type] + df.with_columns(xdt.offset_by("date", "1bd", roll="cabbage")) # type: ignore[arg-type] diff --git a/tests/test_date_range.py b/tests/test_date_range.py index 509def0..510da86 100644 --- a/tests/test_date_range.py +++ b/tests/test_date_range.py @@ -36,9 +36,9 @@ def test_expr() -> None: date(2023, 1, 10), ], ) - result = pl.select(xdt.date_range(date(2023, 1, 1), date(2023, 1, 10), eager=True))[ - "literal" - ] + result = pl.select( + xdt.date_range(date(2023, 1, 1), date(2023, 1, 10), eager=True) + )["literal"] assert_series_equal(result, expected) diff --git a/tests/test_format_localized.py b/tests/test_format_localized.py index c62d95d..50997b3 100644 --- a/tests/test_format_localized.py +++ b/tests/test_format_localized.py @@ -6,37 +6,66 @@ from polars.type_aliases import TimeUnit import polars_xdt as xdt + @pytest.mark.parametrize( - ('time_unit', 'expected_us', 'expected_ukr'), + ("time_unit", "expected_us", "expected_ukr"), [ - ('ms', 'Wednesday, 01 January 2020 00:00:00.123', 'середа, 01 січня 2020 00:00:00,123'), - ('us','Wednesday, 01 January 2020 00:00:00.123456', 'середа, 01 січня 2020 00:00:00,123456'), - ('ns','Wednesday, 01 January 2020 00:00:00.123456789', 'середа, 01 січня 2020 00:00:00,123456789'), - ] - ) -def test_format_localized_datetime(time_unit: TimeUnit, expected_us: str, expected_ukr: str) -> None: + ( + "ms", + "Wednesday, 01 January 2020 00:00:00.123", + "середа, 01 січня 2020 00:00:00,123", + ), + ( + "us", + "Wednesday, 01 January 2020 00:00:00.123456", + "середа, 01 січня 2020 00:00:00,123456", + ), + ( + "ns", + "Wednesday, 01 January 2020 00:00:00.123456789", + "середа, 01 січня 2020 00:00:00,123456789", + ), + ], +) +def test_format_localized_datetime( + time_unit: TimeUnit, expected_us: str, expected_ukr: str +) -> None: df = pl.DataFrame( { - "date_col": ['2020-01-01T00:00:00.123456789'], + "date_col": ["2020-01-01T00:00:00.123456789"], }, 
).select(pl.col("date_col").str.to_datetime(time_unit=time_unit)) - result = df.select(result=xdt.format_localized('date_col', "%A, %d %B %Y %H:%M:%S%.f", 'en_US'))['result'] + result = df.select( + result=xdt.format_localized( + "date_col", "%A, %d %B %Y %H:%M:%S%.f", "en_US" + ) + )["result"] assert result[0] == expected_us - result = df.select(result=xdt.format_localized('date_col', "%A, %d %B %Y %H:%M:%S%.f", 'uk_UA'))['result'] + result = df.select( + result=xdt.format_localized( + "date_col", "%A, %d %B %Y %H:%M:%S%.f", "uk_UA" + ) + )["result"] assert result[0] == expected_ukr + def test_format_localized_date() -> None: df = pl.DataFrame( { "date_col": [date(2024, 8, 24), date(2024, 10, 1)], }, ) - result = df.select(result=xdt.format_localized('date_col', "%A, %d %B %Y", 'en_US'))['result'] - assert result[0] == 'Saturday, 24 August 2024' - assert result[1] == 'Tuesday, 01 October 2024' - result = df.select(result=xdt.format_localized('date_col', "%A, %d %B %Y", 'uk_UA'))['result'] - assert result[0] == 'субота, 24 серпня 2024' - assert result[1] == 'вівторок, 01 жовтня 2024' + result = df.select( + result=xdt.format_localized("date_col", "%A, %d %B %Y", "en_US") + )["result"] + assert result[0] == "Saturday, 24 August 2024" + assert result[1] == "Tuesday, 01 October 2024" + result = df.select( + result=xdt.format_localized("date_col", "%A, %d %B %Y", "uk_UA") + )["result"] + assert result[0] == "субота, 24 серпня 2024" + assert result[1] == "вівторок, 01 жовтня 2024" + def test_tz_aware() -> None: df = pl.DataFrame( @@ -45,11 +74,14 @@ def test_tz_aware() -> None: }, schema={"date_col": pl.Datetime("ns", "Europe/London")}, ) - result = (df.select(result=xdt.format_localized('date_col', "%A, %d %B %Y %z", "uk_UA"))) - assert result['result'][0] == 'субота, 24 серпня 2024 +0100' - assert result['result'][1] == 'вівторок, 01 жовтня 2024 +0100' + result = df.select( + result=xdt.format_localized("date_col", "%A, %d %B %Y %z", "uk_UA") + ) + assert 
result["result"][0] == "субота, 24 серпня 2024 +0100" + assert result["result"][1] == "вівторок, 01 жовтня 2024 +0100" + -@pytest.mark.parametrize('time_unit', ['ms', 'us', 'ns']) +@pytest.mark.parametrize("time_unit", ["ms", "us", "ns"]) def test_pre_epoch(time_unit: TimeUnit) -> None: df = pl.DataFrame( { @@ -57,5 +89,7 @@ def test_pre_epoch(time_unit: TimeUnit) -> None: }, schema={"date_col": pl.Datetime(time_unit, "Europe/London")}, ) - result = (df.select(result=xdt.format_localized('date_col', "%A, %d %B %Y %z", "en_US"))) - assert result['result'][0] == 'Friday, 01 January 1960 +0000' + result = df.select( + result=xdt.format_localized("date_col", "%A, %d %B %Y %z", "en_US") + ) + assert result["result"][0] == "Friday, 01 January 1960 +0000" diff --git a/tests/test_is_busday.py b/tests/test_is_busday.py index 70b9041..4df5d78 100644 --- a/tests/test_is_busday.py +++ b/tests/test_is_busday.py @@ -20,7 +20,7 @@ def get_result( ) -> int: return ( # type: ignore[no-any-return] pl.DataFrame({"date": [date]}) - .select(xdt.is_workday('date', weekend=weekend, holidays=holidays))[ + .select(xdt.is_workday("date", weekend=weekend, holidays=holidays))[ "date" ] .item() @@ -28,7 +28,9 @@ def get_result( @given( - date=st.dates(min_value=dt.date(2000, 1, 1), max_value=dt.date(2000, 12, 31)), + date=st.dates( + min_value=dt.date(2000, 1, 1), max_value=dt.date(2000, 12, 31) + ), weekend=st.lists( st.sampled_from(["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]), min_size=0, @@ -36,7 +38,9 @@ def get_result( unique=True, ), holidays=st.lists( - st.dates(min_value=dt.date(2000, 1, 1), max_value=dt.date(2000, 12, 31)), + st.dates( + min_value=dt.date(2000, 1, 1), max_value=dt.date(2000, 12, 31) + ), min_size=1, max_size=300, ), @@ -63,7 +67,9 @@ def test_against_np_is_busday( unique=True, ), holidays=st.lists( - st.dates(min_value=dt.date(2000, 1, 1), max_value=dt.date(2000, 12, 31)), + st.dates( + min_value=dt.date(2000, 1, 1), max_value=dt.date(2000, 12, 31) + ), 
min_size=1, max_size=300, ), diff --git a/tests/test_sub.py b/tests/test_sub.py index 265eeb2..d0fd4ce 100644 --- a/tests/test_sub.py +++ b/tests/test_sub.py @@ -23,14 +23,22 @@ def get_result( ) -> int: return ( # type: ignore[no-any-return] pl.DataFrame({"end_date": [end_date]}) - .select(n=xdt.workday_count(start_date, "end_date", weekend=weekend, holidays=holidays))["n"] # type: ignore[arg-type] + .select( + n=xdt.workday_count( + start_date, "end_date", weekend=weekend, holidays=holidays + ) + )["n"] # type: ignore[arg-type] .item() ) @given( - start_date=st.dates(min_value=dt.date(2000, 1, 1), max_value=dt.date(2000, 12, 31)), - end_date=st.dates(min_value=dt.date(2000, 1, 1), max_value=dt.date(2000, 12, 31)), + start_date=st.dates( + min_value=dt.date(2000, 1, 1), max_value=dt.date(2000, 12, 31) + ), + end_date=st.dates( + min_value=dt.date(2000, 1, 1), max_value=dt.date(2000, 12, 31) + ), function=st.sampled_from([lambda x: x, lambda x: pl.Series([x])]), weekend=st.lists( st.sampled_from(["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]), @@ -39,7 +47,9 @@ def get_result( unique=True, ), holidays=st.lists( - st.dates(min_value=dt.date(2000, 1, 1), max_value=dt.date(2000, 12, 31)), + st.dates( + min_value=dt.date(2000, 1, 1), max_value=dt.date(2000, 12, 31) + ), min_size=1, max_size=300, ), @@ -51,9 +61,13 @@ def test_against_np_busday_count( holidays: list[dt.date], function: Callable[[dt.date], dt.date | pl.Series], ) -> None: - result = get_result( function(start_date), end_date, weekend=weekend, holidays=holidays) + result = get_result( + function(start_date), end_date, weekend=weekend, holidays=holidays + ) weekmask = [0 if reverse_mapping[i] in weekend else 1 for i in range(1, 8)] - expected = np.busday_count( start_date, end_date, weekmask=weekmask, holidays=holidays) + expected = np.busday_count( + start_date, end_date, weekmask=weekmask, holidays=holidays + ) if start_date > end_date and tuple( int(v) for v in np.__version__.split(".")[:2] ) < (1, 
25): @@ -63,8 +77,12 @@ def test_against_np_busday_count( @given( - start_date=st.dates(min_value=dt.date(2000, 1, 1), max_value=dt.date(2000, 12, 31)), - end_date=st.dates(min_value=dt.date(2000, 1, 1), max_value=dt.date(2000, 12, 31)), + start_date=st.dates( + min_value=dt.date(2000, 1, 1), max_value=dt.date(2000, 12, 31) + ), + end_date=st.dates( + min_value=dt.date(2000, 1, 1), max_value=dt.date(2000, 12, 31) + ), weekend=st.lists( st.sampled_from(["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]), min_size=0, @@ -72,7 +90,9 @@ def test_against_np_busday_count( unique=True, ), holidays=st.lists( - st.dates(min_value=dt.date(2000, 1, 1), max_value=dt.date(2000, 12, 31)), + st.dates( + min_value=dt.date(2000, 1, 1), max_value=dt.date(2000, 12, 31) + ), min_size=1, max_size=300, ), @@ -84,7 +104,9 @@ def test_against_naive_python( holidays: list[dt.date], ) -> None: assume(end_date > start_date) - result = get_result(start_date, end_date, weekend=weekend, holidays=holidays) + result = get_result( + start_date, end_date, weekend=weekend, holidays=holidays + ) expected = 0 start_date_copy = start_date while start_date_copy < end_date: @@ -108,7 +130,10 @@ def test_empty_weekmask() -> None: ) with pytest.raises(ValueError): df.select( - xdt.workday_count("start", "end", weekend=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] + xdt.workday_count( + "start", + "end", + weekend=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"], ) ) @@ -119,9 +144,7 @@ def test_sub_lit() -> None: "end": [dt.date(2020, 1, 3), dt.date(2020, 1, 5)], } ) - result = df.select( - xdt.workday_count(pl.lit(dt.date(2020, 1, 1)), "end") - ) + result = df.select(xdt.workday_count(pl.lit(dt.date(2020, 1, 1)), "end")) assert result["literal"][0] == 2 assert result["literal"][1] == 3 @@ -136,9 +159,13 @@ def test_workday_count() -> None: result = df.with_columns(workday_count=xdt.workday_count("start", "end")) assert result["workday_count"][0] == 3 assert result["workday_count"][1] == 10 - result = 
df.with_columns(workday_count=xdt.workday_count("start", dt.date(2020, 1, 8))) + result = df.with_columns( + workday_count=xdt.workday_count("start", dt.date(2020, 1, 8)) + ) assert result["workday_count"][0] == 3 assert result["workday_count"][1] == 2 - result = df.with_columns(workday_count=xdt.workday_count(dt.date(2020, 1, 5), pl.col("end"))) + result = df.with_columns( + workday_count=xdt.workday_count(dt.date(2020, 1, 5), pl.col("end")) + ) assert result["workday_count"][0] == 2 assert result["workday_count"][1] == 10 diff --git a/tests/test_timezone.py b/tests/test_timezone.py index 1fb54e0..f72c1ec 100644 --- a/tests/test_timezone.py +++ b/tests/test_timezone.py @@ -39,7 +39,7 @@ def test_convert_tz_to_local_datetime( expected = df.with_columns(pl.lit(local_date).alias("local_dt")) result = df.with_columns( - xdt.to_local_datetime('date', pl.col("timezone")).alias("local_dt") + xdt.to_local_datetime("date", pl.col("timezone")).alias("local_dt") ) assert_frame_equal(result, expected) @@ -75,8 +75,9 @@ def test_convert_tz_from_local_datetime( ) result = df.with_columns( - xdt.from_local_datetime('local_date', pl.col("timezone"), "Europe/London") - .alias("date") + xdt.from_local_datetime( + "local_date", pl.col("timezone"), "Europe/London" + ).alias("date") ) assert_frame_equal(result, expected) @@ -92,8 +93,9 @@ def test_convert_tz_from_local_datetime_literal() -> None: ) result = df.with_columns( - xdt.from_local_datetime('local_date', "America/New_York", "Europe/London") - .alias("date") + xdt.from_local_datetime( + "local_date", "America/New_York", "Europe/London" + ).alias("date") ) assert_frame_equal(result, expected) @@ -103,7 +105,9 @@ def test_convert_tz_to_local_datetime_literal() -> None: {"date": [datetime(2020, 10, 15, tzinfo=timezone.utc)]} ).with_columns(pl.col("date").dt.convert_time_zone("Europe/London")) - expected = df.with_columns(pl.lit(datetime(2020, 10, 14, 20, 0)).alias("local_dt")) + expected = df.with_columns( + pl.lit(datetime(2020, 
10, 14, 20, 0)).alias("local_dt") + ) result = df.with_columns( xdt.to_local_datetime("date", "America/New_York").alias("local_dt") From 49af4c7c8452931fd7adec1198be7af1e5634b15 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Mon, 22 Jan 2024 10:56:26 +0000 Subject: [PATCH 4/8] add readme note --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index c02c2c8..5ccad90 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,9 @@ shape: (3, 2) │ 2024-01-04 ┆ 2024-01-11 │ └────────────┴──────────────┘ ``` +Note that `polars-xdt` also registers an `xdt` namespace on the `pl.Expr` class, so you +could equivalently write the above using `pl.col('date').xdt.offset_by('5bd')` (but note +that then type-checking would not recognise the `xdt` attribute). Read the [documentation](https://marcogorelli.github.io/polars-xdt-docs/) for more examples! From 066080a8dda1934050c47c592904162edce1e1a2 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Mon, 22 Jan 2024 10:58:14 +0000 Subject: [PATCH 5/8] move namespace into __init__ --- polars_xdt/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/polars_xdt/__init__.py b/polars_xdt/__init__.py index 717456b..9099b37 100644 --- a/polars_xdt/__init__.py +++ b/polars_xdt/__init__.py @@ -1,5 +1,6 @@ from __future__ import annotations +import polars_xdt.namespace # noqa: F401 from polars_xdt.functions import ( ceil, day_name, From f2a8b1cfd3695e5cdeb4eeecabe42299adb6db10 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Mon, 22 Jan 2024 12:03:10 +0000 Subject: [PATCH 6/8] only use polars public API --- polars_xdt/__init__.py | 1 + polars_xdt/functions.py | 29 ++++++++++++++--------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/polars_xdt/__init__.py b/polars_xdt/__init__.py index 9099b37..c089dc3 100644 --- 
a/polars_xdt/__init__.py +++ b/polars_xdt/__init__.py @@ -14,6 +14,7 @@ workday_count, ) from polars_xdt.ranges import date_range +from polars_xdt.utils import parse_into_expr from ._internal import __version__ diff --git a/polars_xdt/functions.py b/polars_xdt/functions.py index cb00ec4..1181fb3 100644 --- a/polars_xdt/functions.py +++ b/polars_xdt/functions.py @@ -6,9 +6,8 @@ from typing import TYPE_CHECKING, Literal, Sequence import polars as pl -from polars.utils._parse_expr_input import parse_as_expression -from polars.utils._wrap import wrap_expr from polars.utils.udfs import _get_shared_lib_location +from polars_xdt.utils import parse_into_expr if sys.version_info >= (3, 10): from typing import TypeAlias @@ -144,7 +143,7 @@ def offset_by( │ 2024-01-04 ┆ -3bd ┆ 2024-01-01 │ └────────────┴──────┴──────────────┘ """ - expr = wrap_expr(parse_as_expression(expr)) + expr = parse_into_expr(expr) if ( isinstance(by, str) and (match := re.search(r"(\d+bd)", by)) is not None @@ -232,7 +231,7 @@ def is_workday( │ 2023-09-09 ┆ false │ └────────────┴────────────┘ """ - expr = wrap_expr(parse_as_expression(expr)) + expr = parse_into_expr(expr) weekmask = get_weekmask(weekend) if not holidays: holidays_int = [] @@ -315,8 +314,8 @@ def from_local_datetime( │ 2020-10-09 20:00:00 ┆ America/New_York ┆ 2020-10-10 00:00:00 UTC │ └─────────────────────┴──────────────────┴─────────────────────────┘ """ - expr = wrap_expr(parse_as_expression(expr)) - from_tz = wrap_expr(parse_as_expression(from_tz, str_as_lit=True)) + expr = parse_into_expr(expr) + from_tz = parse_into_expr(from_tz, str_as_lit=True) return expr.register_plugin( lib=lib, symbol="from_local_datetime", @@ -378,8 +377,8 @@ def to_local_datetime( │ 2020-10-10 00:00:00 UTC ┆ America/New_York ┆ 2020-10-09 20:00:00 │ └─────────────────────────┴──────────────────┴─────────────────────┘ """ - expr = wrap_expr(parse_as_expression(expr)) - time_zone = wrap_expr(parse_as_expression(time_zone, str_as_lit=True)) + expr = 
parse_into_expr(expr) + time_zone = parse_into_expr(time_zone, str_as_lit=True) return expr.register_plugin( lib=lib, symbol="to_local_datetime", @@ -433,7 +432,7 @@ def format_localized( │ 2024-10-01 00:00:00 ┆ вівторок, 01 жовтня 2024 │ └─────────────────────┴──────────────────────────┘ """ - expr = wrap_expr(parse_as_expression(expr)) + expr = parse_into_expr(expr) return expr.register_plugin( lib=lib, symbol="format_localized", @@ -470,7 +469,7 @@ def to_julian_date(expr: str | pl.Expr) -> pl.Expr: │ 2024-01-07 13:18:51 ┆ 2460317.0547569445 │ └─────────────────────┴────────────────────┘ """ - expr = wrap_expr(parse_as_expression(expr)) + expr = parse_into_expr(expr) return expr.register_plugin( lib=lib, symbol="to_julian_date", @@ -536,7 +535,7 @@ def ceil( │ 2024-10-01 00:00:00 ┆ 2024-10-01 00:00:00 │ └─────────────────────┴─────────────────────┘ """ - expr = wrap_expr(parse_as_expression(expr)) + expr = parse_into_expr(expr) truncated = expr.dt.truncate(every) return ( pl.when(expr == truncated) @@ -582,7 +581,7 @@ def day_name(expr: str | pl.Expr, locale: str | None = None) -> pl.Expr: │ 2020-10-26 00:00:00 ┆ Monday ┆ lundi ┆ понеділок │ └─────────────────────┴──────────────────┴─────────────────┴────────────────────┘ """ - expr = wrap_expr(parse_as_expression(expr)) + expr = parse_into_expr(expr) if locale is None: result = expr.dt.to_string("%A") else: @@ -627,7 +626,7 @@ def month_name(expr: str | pl.Expr, locale: str | None = None) -> pl.Expr: │ 2020-11-26 00:00:00 ┆ November ┆ novembre ┆ листопада │ └─────────────────────┴────────────────────┴───────────────────┴──────────────────────┘ """ - expr = wrap_expr(parse_as_expression(expr)) + expr = parse_into_expr(expr) if locale is None: result = expr.dt.to_string("%B") else: @@ -682,8 +681,8 @@ def workday_count( │ 2023-09-09 ┆ 2023-12-30 ┆ 80 │ └────────────┴────────────┴─────────────────┘ """ - start_dates = wrap_expr(parse_as_expression(start_dates)) - end_dates = 
wrap_expr(parse_as_expression(end_dates)) + start_dates = parse_into_expr(start_dates) + end_dates = parse_into_expr(end_dates) weekmask = get_weekmask(weekend) if not holidays: holidays_int = [] From 3ff2f50f154921295cad810f6966a56f8c5263c4 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Mon, 22 Jan 2024 12:19:15 +0000 Subject: [PATCH 7/8] fixup docs --- docs/api/polars_xdt.ExprXDTNamespace.ceil.rst | 6 - ..._xdt.ExprXDTNamespace.format_localized.rst | 6 - ...t.ExprXDTNamespace.from_local_datetime.rst | 6 - ...polars_xdt.ExprXDTNamespace.is_workday.rst | 6 - .../polars_xdt.ExprXDTNamespace.offset_by.rst | 6 - ...xdt.ExprXDTNamespace.to_local_datetime.rst | 6 - docs/tutorial.rst | 16 +- polars_xdt/__init__.py | 1 - polars_xdt/functions.py | 207 ++++++++++-------- polars_xdt/ranges.py | 3 +- pyproject.toml | 8 +- 11 files changed, 130 insertions(+), 141 deletions(-) delete mode 100644 docs/api/polars_xdt.ExprXDTNamespace.ceil.rst delete mode 100644 docs/api/polars_xdt.ExprXDTNamespace.format_localized.rst delete mode 100644 docs/api/polars_xdt.ExprXDTNamespace.from_local_datetime.rst delete mode 100644 docs/api/polars_xdt.ExprXDTNamespace.is_workday.rst delete mode 100644 docs/api/polars_xdt.ExprXDTNamespace.offset_by.rst delete mode 100644 docs/api/polars_xdt.ExprXDTNamespace.to_local_datetime.rst diff --git a/docs/api/polars_xdt.ExprXDTNamespace.ceil.rst b/docs/api/polars_xdt.ExprXDTNamespace.ceil.rst deleted file mode 100644 index 64cc014..0000000 --- a/docs/api/polars_xdt.ExprXDTNamespace.ceil.rst +++ /dev/null @@ -1,6 +0,0 @@ -polars\_xdt.ExprXDTNamespace.ceil -================================= - -.. currentmodule:: polars_xdt - -.. 
automethod:: ExprXDTNamespace.ceil \ No newline at end of file diff --git a/docs/api/polars_xdt.ExprXDTNamespace.format_localized.rst b/docs/api/polars_xdt.ExprXDTNamespace.format_localized.rst deleted file mode 100644 index 5359c12..0000000 --- a/docs/api/polars_xdt.ExprXDTNamespace.format_localized.rst +++ /dev/null @@ -1,6 +0,0 @@ -polars\_xdt.ExprXDTNamespace.format\_localized -============================================== - -.. currentmodule:: polars_xdt - -.. automethod:: ExprXDTNamespace.format_localized \ No newline at end of file diff --git a/docs/api/polars_xdt.ExprXDTNamespace.from_local_datetime.rst b/docs/api/polars_xdt.ExprXDTNamespace.from_local_datetime.rst deleted file mode 100644 index c611900..0000000 --- a/docs/api/polars_xdt.ExprXDTNamespace.from_local_datetime.rst +++ /dev/null @@ -1,6 +0,0 @@ -polars\_xdt.ExprXDTNamespace.from\_local\_datetime -================================================== - -.. currentmodule:: polars_xdt - -.. automethod:: ExprXDTNamespace.from_local_datetime \ No newline at end of file diff --git a/docs/api/polars_xdt.ExprXDTNamespace.is_workday.rst b/docs/api/polars_xdt.ExprXDTNamespace.is_workday.rst deleted file mode 100644 index 26e887c..0000000 --- a/docs/api/polars_xdt.ExprXDTNamespace.is_workday.rst +++ /dev/null @@ -1,6 +0,0 @@ -polars\_xdt.ExprXDTNamespace.is\_workday -======================================== - -.. currentmodule:: polars_xdt - -.. automethod:: ExprXDTNamespace.is_workday \ No newline at end of file diff --git a/docs/api/polars_xdt.ExprXDTNamespace.offset_by.rst b/docs/api/polars_xdt.ExprXDTNamespace.offset_by.rst deleted file mode 100644 index 2d75c53..0000000 --- a/docs/api/polars_xdt.ExprXDTNamespace.offset_by.rst +++ /dev/null @@ -1,6 +0,0 @@ -polars\_xdt.ExprXDTNamespace.offset\_by -======================================= - -.. currentmodule:: polars_xdt - -.. 
automethod:: ExprXDTNamespace.offset_by \ No newline at end of file diff --git a/docs/api/polars_xdt.ExprXDTNamespace.to_local_datetime.rst b/docs/api/polars_xdt.ExprXDTNamespace.to_local_datetime.rst deleted file mode 100644 index 8aa8dbf..0000000 --- a/docs/api/polars_xdt.ExprXDTNamespace.to_local_datetime.rst +++ /dev/null @@ -1,6 +0,0 @@ -polars\_xdt.ExprXDTNamespace.to\_local\_datetime -================================================ - -.. currentmodule:: polars_xdt - -.. automethod:: ExprXDTNamespace.to_local_datetime \ No newline at end of file diff --git a/docs/tutorial.rst b/docs/tutorial.rst index db99482..3f8bc63 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -8,7 +8,7 @@ Say we start with from datetime import date import polars as pl - import polars_xdt # noqa: F401 + import polars_xdt as xdt df = pl.DataFrame( @@ -21,10 +21,7 @@ Let's shift `Date` forwards by 5 days, excluding Saturday and Sunday: .. code-block:: python result = df.with_columns( - date_shifted=pl.col("date").xdt.offset_by( - '5bd', - weekend=('Sat', 'Sun'), - ) + date_shifted=xdt.offset_by('date', '5bd', weekend=('Sat', 'Sun')) ) print(result) @@ -52,10 +49,11 @@ for 2023 and 2024 (note: you'll need to install the england_holidays = holidays.country_holidays("UK", subdiv='ENG', years=[2023, 2024]) result = df.with_columns( - date_shifted=pl.col("date").xdt.offset_by( - by='5bd', - weekend=('Sat', 'Sun'), - holidays=england_holidays, + date_shifted=xdt.offset_by( + 'date', + by='5bd', + weekend=('Sat', 'Sun'), + holidays=england_holidays, ) ) print(result) diff --git a/polars_xdt/__init__.py b/polars_xdt/__init__.py index c089dc3..9099b37 100644 --- a/polars_xdt/__init__.py +++ b/polars_xdt/__init__.py @@ -14,7 +14,6 @@ workday_count, ) from polars_xdt.ranges import date_range -from polars_xdt.utils import parse_into_expr from ._internal import __version__ diff --git a/polars_xdt/functions.py b/polars_xdt/functions.py index 1181fb3..8b5edf8 100644 --- 
a/polars_xdt/functions.py +++ b/polars_xdt/functions.py @@ -7,6 +7,7 @@ import polars as pl from polars.utils.udfs import _get_shared_lib_location + from polars_xdt.utils import parse_into_expr if sys.version_info >= (3, 10): @@ -49,99 +50,102 @@ def offset_by( holidays: Sequence[date] | None = None, roll: RollStrategy = "raise", ) -> pl.Expr: - """Offset this date by a relative time offset. + """ + Offset this date by a relative time offset. Parameters ---------- + expr + Expression to offset by relative time offset. by The offset to apply. This can be a string of the form "nbd" (where n is an integer), or a polars expression that evaluates to such a string. Additional units are passed to `polars.dt.offset_by`. weekend The days of the week that are considered weekends. Defaults to ("Sat", "Sun"). - holidays - The holidays to exclude from the calculation. Defaults to None. - roll - How to handle dates that fall on a non-workday. - - - "raise" raise an error (default). - - "forward" roll forward to the next business day. - - "backward" roll backward to the previous business day. - - Returns - ------- - polars.Expr - - Examples - -------- - >>> import polars as pl - >>> import polars_xdt as xdt - >>> df = pl.DataFrame( - ... {"date": [date(2023, 4, 3), date(2023, 9, 1), date(2024, 1, 4)]} - ... ) - >>> df.with_columns( - ... date_shifted=xdt.offset_by("date", "1bd"), - ... ) - shape: (3, 2) - ┌────────────┬──────────────┐ - │ date ┆ date_shifted │ - │ --- ┆ --- │ - │ date ┆ date │ - ╞════════════╪══════════════╡ - │ 2023-04-03 ┆ 2023-04-04 │ - │ 2023-09-01 ┆ 2023-09-04 │ - │ 2024-01-04 ┆ 2024-01-05 │ - └────────────┴──────────────┘ - - You can also specify custom weekends and holidays: - - >>> import holidays - >>> holidays_england = holidays.country_holidays( - ... "UK", subdiv="ENG", years=[2023, 2024] - ... ) - >>> df.with_columns( - ... date_shifted=xdt.offset_by( - ... "date", - ... "5bd", - ... holidays=holidays_england, - ... weekend=["Fri", "Sat"], - ... 
roll="backward", - ... ), - ... ) - shape: (3, 2) - ┌────────────┬──────────────┐ - │ date ┆ date_shifted │ - │ --- ┆ --- │ - │ date ┆ date │ - ╞════════════╪══════════════╡ - │ 2023-04-03 ┆ 2023-04-11 │ - │ 2023-09-01 ┆ 2023-09-07 │ - │ 2024-01-04 ┆ 2024-01-11 │ - └────────────┴──────────────┘ - - You can also pass expressions to `by`: - - >>> df = pl.DataFrame( - ... { - ... "date": [ - ... date(2023, 4, 3), - ... date(2023, 9, 1), - ... date(2024, 1, 4), - ... ], - ... "by": ["1bd", "2bd", "-3bd"], - ... } - ... ) - >>> df.with_columns(date_shifted=xdt.offset_by("date", pl.col("by"))) - shape: (3, 3) - ┌────────────┬──────┬──────────────┐ - │ date ┆ by ┆ date_shifted │ - │ --- ┆ --- ┆ --- │ - │ date ┆ str ┆ date │ - ╞════════════╪══════╪══════════════╡ - │ 2023-04-03 ┆ 1bd ┆ 2023-04-04 │ - │ 2023-09-01 ┆ 2bd ┆ 2023-09-05 │ - │ 2024-01-04 ┆ -3bd ┆ 2024-01-01 │ - └────────────┴──────┴──────────────┘ + holidays + The holidays to exclude from the calculation. Defaults to None. + roll + How to handle dates that fall on a non-workday. + + - "raise" raise an error (default). + - "forward" roll forward to the next business day. + - "backward" roll backward to the previous business day. + + Returns + ------- + polars.Expr + + Examples + -------- + >>> import polars as pl + >>> import polars_xdt as xdt + >>> df = pl.DataFrame( + ... {"date": [date(2023, 4, 3), date(2023, 9, 1), date(2024, 1, 4)]} + ... ) + >>> df.with_columns( + ... date_shifted=xdt.offset_by("date", "1bd"), + ... ) + shape: (3, 2) + ┌────────────┬──────────────┐ + │ date ┆ date_shifted │ + │ --- ┆ --- │ + │ date ┆ date │ + ╞════════════╪══════════════╡ + │ 2023-04-03 ┆ 2023-04-04 │ + │ 2023-09-01 ┆ 2023-09-04 │ + │ 2024-01-04 ┆ 2024-01-05 │ + └────────────┴──────────────┘ + + You can also specify custom weekends and holidays: + + >>> import holidays + >>> holidays_england = holidays.country_holidays( + ... "UK", subdiv="ENG", years=[2023, 2024] + ... ) + >>> df.with_columns( + ... 
date_shifted=xdt.offset_by( + ... "date", + ... "5bd", + ... holidays=holidays_england, + ... weekend=["Fri", "Sat"], + ... roll="backward", + ... ), + ... ) + shape: (3, 2) + ┌────────────┬──────────────┐ + │ date ┆ date_shifted │ + │ --- ┆ --- │ + │ date ┆ date │ + ╞════════════╪══════════════╡ + │ 2023-04-03 ┆ 2023-04-11 │ + │ 2023-09-01 ┆ 2023-09-07 │ + │ 2024-01-04 ┆ 2024-01-11 │ + └────────────┴──────────────┘ + + You can also pass expressions to `by`: + + >>> df = pl.DataFrame( + ... { + ... "date": [ + ... date(2023, 4, 3), + ... date(2023, 9, 1), + ... date(2024, 1, 4), + ... ], + ... "by": ["1bd", "2bd", "-3bd"], + ... } + ... ) + >>> df.with_columns(date_shifted=xdt.offset_by("date", pl.col("by"))) + shape: (3, 3) + ┌────────────┬──────┬──────────────┐ + │ date ┆ by ┆ date_shifted │ + │ --- ┆ --- ┆ --- │ + │ date ┆ str ┆ date │ + ╞════════════╪══════╪══════════════╡ + │ 2023-04-03 ┆ 1bd ┆ 2023-04-04 │ + │ 2023-09-01 ┆ 2bd ┆ 2023-09-05 │ + │ 2024-01-04 ┆ -3bd ┆ 2024-01-01 │ + └────────────┴──────┴──────────────┘ """ expr = parse_into_expr(expr) if ( @@ -191,10 +195,13 @@ def is_workday( weekend: Sequence[str] = ("Sat", "Sun"), holidays: Sequence[date] | None = None, ) -> pl.Expr: - """Determine whether a day is a workday. + """ + Determine whether a day is a workday. Parameters ---------- + expr + Input expression. weekend The days of the week that are considered weekends. Defaults to ("Sat", "Sun"). holidays @@ -257,10 +264,13 @@ def from_local_datetime( to_tz: str, ambiguous: Ambiguous = "raise", ) -> pl.Expr: - """Converts from local datetime in given time zone to new timezone. + """ + Convert from local datetime in given time zone to new timezone. Parameters ---------- + expr + Expression to convert. from_tz Current timezone of each datetime to_tz @@ -332,10 +342,13 @@ def to_local_datetime( expr: IntoExpr, time_zone: str | Expr, ) -> pl.Expr: - """Convert to local datetime in given time zone. + """ + Convert to local datetime in given time zone. 
Parameters ---------- + expr + Expression to convert. time_zone Time zone to convert to. @@ -392,10 +405,13 @@ def format_localized( format: str, # noqa: A002 locale: str = "uk_UA", ) -> pl.Expr: - """Convert to local datetime in given time zone. + """ + Format dates or datetimes as strings, localized for the given locale. Parameters ---------- + expr + Expression to format. format Format string, see https://docs.rs/chrono/latest/chrono/format/strftime/index.html for what's available. @@ -443,7 +459,8 @@ def format_localized( def to_julian_date(expr: str | pl.Expr) -> pl.Expr: - """Return the Julian date corresponding to given datetimes. + """ + Return the Julian date corresponding to given datetimes. Examples -------- @@ -482,10 +499,13 @@ def ceil( expr: IntoExpr, every: str | pl.Expr, ) -> pl.Expr: - """Find "ceiling" of datetime. + """ + Find "ceiling" of datetime. Parameters ---------- + expr + Expression to take "ceiling" of. every Duration string, created with the the following string language: @@ -640,13 +660,14 @@ def workday_count( weekend: Sequence[str] = ("Sat", "Sun"), holidays: Sequence[date] | None = None, ) -> pl.Expr: - """Count the number of workdays between two columns of dates. + """ + Count the number of workdays between two columns of dates. Parameters ---------- - start + start_dates Start date(s). This can be a string column, a date column, or a single date. - end + end_dates End date(s). This can be a string column, a date column, or a single date. weekend The days of the week that are considered weekends. Defaults to ("Sat", "Sun"). diff --git a/polars_xdt/ranges.py b/polars_xdt/ranges.py index fee8530..1192b75 100644 --- a/polars_xdt/ranges.py +++ b/polars_xdt/ranges.py @@ -73,7 +73,8 @@ def date_range( # noqa: PLR0913 weekend: Sequence[str] = ("Sat", "Sun"), holidays: Sequence[date] | None = None, ) -> pl.Series | pl.Expr: - """Create a range of dates with a given interval and filter out weekends and holidays. 
+ """ + Create a range of dates with a given interval and filter out weekends and holidays. Parameters ---------- diff --git a/pyproject.toml b/pyproject.toml index 3620ef4..d0c822e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,7 +42,13 @@ ignore = [ 'ARG003', # todo: enable 'C901', 'COM812', - 'D', + 'D100', + 'D103', + 'D104', + 'D105', + 'D107', + 'D203', + 'D212', 'DTZ', 'E501', 'FBT003', # todo: enable From e076b1692ff1784fb122d3cf9e47e6885e67f5d9 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Mon, 22 Jan 2024 14:34:43 +0000 Subject: [PATCH 8/8] dont forget the utils file --- polars_xdt/utils.py | 48 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 polars_xdt/utils.py diff --git a/polars_xdt/utils.py b/polars_xdt/utils.py new file mode 100644 index 0000000..73f7a19 --- /dev/null +++ b/polars_xdt/utils.py @@ -0,0 +1,48 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import polars as pl + +if TYPE_CHECKING: + from polars.type_aliases import IntoExpr, PolarsDataType + + +def parse_into_expr( + expr: IntoExpr, + *, + str_as_lit: bool = False, + list_as_lit: bool = True, + dtype: PolarsDataType | None = None, +) -> pl.Expr: + """ + Parse a single input into an expression. + + Parameters + ---------- + expr + The input to be parsed as an expression. + str_as_lit + Interpret string input as a string literal. If set to `False` (default), + strings are parsed as column names. + list_as_lit + Interpret list input as a list literal. If set to `False`, + lists are parsed as `Series` literals. + dtype + If the input is expected to resolve to a literal with a known dtype, pass + this to the `lit` constructor.
+ + Returns + ------- + polars.Expr + """ + if isinstance(expr, pl.Expr): + pass + elif isinstance(expr, str) and not str_as_lit: + expr = pl.col(expr) + elif isinstance(expr, list) and not list_as_lit: + expr = pl.lit(pl.Series(expr), dtype=dtype) + else: + expr = pl.lit(expr, dtype=dtype) + + return expr