diff --git a/docs/API.rst b/docs/API.rst
index 15bded3..556c767 100644
--- a/docs/API.rst
+++ b/docs/API.rst
@@ -6,7 +6,9 @@ API
 
    polars_xdt.date_range
    polars_xdt.workday_count
+   polars_xdt.ExprXDTNamespace.base_utc_offset
    polars_xdt.ExprXDTNamespace.ceil
+   polars_xdt.ExprXDTNamespace.dst_offset
    polars_xdt.ExprXDTNamespace.format_localized
    polars_xdt.ExprXDTNamespace.from_local_datetime
    polars_xdt.ExprXDTNamespace.is_workday
diff --git a/polars_xdt/__init__.py b/polars_xdt/__init__.py
index 702b1eb..6bc993e 100644
--- a/polars_xdt/__init__.py
+++ b/polars_xdt/__init__.py
@@ -578,6 +578,100 @@ def ceil(
         )
         return cast(XDTExpr, result)
 
+    def base_utc_offset(self) -> XDTExpr:
+        """
+        Base offset from UTC.
+
+        This is usually constant for all datetimes in a given time zone, but
+        may vary in the rare case that a country switches time zone, like
+        Samoa (Apia) did at the end of 2011.
+
+        Returns
+        -------
+        Expr
+            Expression of data type :class:`Duration`.
+
+        See Also
+        --------
+        Expr.dt.dst_offset : Daylight savings offset from UTC.
+
+        Examples
+        --------
+        >>> from datetime import datetime
+        >>> import polars_xdt  # noqa: F401
+        >>> df = pl.DataFrame(
+        ...     {
+        ...         "ts": [datetime(2011, 12, 29), datetime(2012, 1, 1)],
+        ...     }
+        ... )
+        >>> df = df.with_columns(
+        ...     pl.col("ts").dt.replace_time_zone("Pacific/Apia")
+        ... )
+        >>> df.with_columns(
+        ...     pl.col("ts").xdt.base_utc_offset().alias("base_utc_offset")
+        ... )
+        shape: (2, 2)
+        ┌────────────────────────────┬─────────────────┐
+        │ ts                         ┆ base_utc_offset │
+        │ ---                        ┆ ---             │
+        │ datetime[μs, Pacific/Apia] ┆ duration[ms]    │
+        ╞════════════════════════════╪═════════════════╡
+        │ 2011-12-29 00:00:00 -10    ┆ -11h            │
+        │ 2012-01-01 00:00:00 +14    ┆ 13h             │
+        └────────────────────────────┴─────────────────┘
+        """
+        result = self._expr.register_plugin(
+            lib=lib,
+            symbol="base_utc_offset",
+            is_elementwise=True,
+            args=[],
+        )
+        return cast(XDTExpr, result)
+
+    def dst_offset(self) -> XDTExpr:
+        """
+        Additional offset currently in effect (typically due to daylight saving time).
+
+        Returns
+        -------
+        Expr
+            Expression of data type :class:`Duration`.
+
+        See Also
+        --------
+        Expr.dt.base_utc_offset : Base offset from UTC.
+
+        Examples
+        --------
+        >>> from datetime import datetime
+        >>> import polars_xdt  # noqa: F401
+        >>> df = pl.DataFrame(
+        ...     {
+        ...         "ts": [datetime(2020, 10, 25), datetime(2020, 10, 26)],
+        ...     }
+        ... )
+        >>> df = df.with_columns(
+        ...     pl.col("ts").dt.replace_time_zone("Europe/London")
+        ... )
+        >>> df.with_columns(pl.col("ts").xdt.dst_offset().alias("dst_offset"))
+        shape: (2, 2)
+        ┌─────────────────────────────┬──────────────┐
+        │ ts                          ┆ dst_offset   │
+        │ ---                         ┆ ---          │
+        │ datetime[μs, Europe/London] ┆ duration[ms] │
+        ╞═════════════════════════════╪══════════════╡
+        │ 2020-10-25 00:00:00 BST     ┆ 1h           │
+        │ 2020-10-26 00:00:00 GMT     ┆ 0ms          │
+        └─────────────────────────────┴──────────────┘
+        """
+        result = self._expr.register_plugin(
+            lib=lib,
+            symbol="dst_offset",
+            is_elementwise=True,
+            args=[],
+        )
+        return cast(XDTExpr, result)
+
 
 class XDTExpr(pl.Expr):
     @property
diff --git a/src/expressions.rs b/src/expressions.rs
index e7a7274..026dcf8 100644
--- a/src/expressions.rs
+++ b/src/expressions.rs
@@ -5,9 +5,12 @@ use crate::is_workday::*;
 use crate::sub::*;
 use crate::timezone::*;
 use crate::to_julian::*;
+use crate::utc_offsets::*;
+use chrono_tz::Tz;
 use polars::prelude::*;
 use pyo3_polars::derive::polars_expr;
 use serde::Deserialize;
+use std::str::FromStr;
 #[derive(Deserialize)]
 pub struct BusinessDayKwargs {
     holidays: Vec<i32>,
@@ -26,10 +29,18 @@ pub struct FormatLocalizedKwargs {
     locale: String,
 }
 
-fn bday_output(input_fields: &[Field]) -> PolarsResult<Field> {
+fn same_output(input_fields: &[Field]) -> PolarsResult<Field> {
     let field = input_fields[0].clone();
     Ok(field)
 }
+/// Output field shared by the offset expressions: keeps the input column's
+/// name and reports a `Duration` in milliseconds.
+fn duration_ms(input_fields: &[Field]) -> PolarsResult<Field> {
+    Ok(Field::new(
+        input_fields[0].name(),
+        DataType::Duration(TimeUnit::Milliseconds),
+    ))
+}
 
 pub fn to_local_datetime_output(input_fields: &[Field]) -> PolarsResult<Field> {
     let field = input_fields[0].clone();
@@ -53,7 +64,7 @@ pub fn from_local_datetime_output(input_fields: &[Field]) -> PolarsResult<Field>
     Ok(Field::new(&field.name, dtype))
 }
 
-#[polars_expr(output_type_func=bday_output)]
+#[polars_expr(output_type_func=same_output)]
 fn advance_n_days(inputs: &[Series], kwargs: BusinessDayKwargs) -> PolarsResult<Series> {
     let s = &inputs[0];
     let n = &inputs[1].cast(&DataType::Int32)?;
@@ -110,3 +121,43 @@ fn to_julian_date(inputs: &[Series]) -> PolarsResult<Series> {
     let s = &inputs[0];
     impl_to_julian_date(s)
 }
+
+/// Base UTC offset of each value in a time-zone-aware Datetime column.
+///
+/// Returns a `Duration` (milliseconds) series; bails with `InvalidOperation`
+/// when the input dtype is not a `Datetime` carrying a time zone.
+#[polars_expr(output_type_func=duration_ms)]
+fn base_utc_offset(inputs: &[Series]) -> PolarsResult<Series> {
+    let s = &inputs[0];
+    match s.dtype() {
+        DataType::Datetime(time_unit, Some(time_zone)) => {
+            // Surface an unparsable zone name as an error instead of panicking.
+            let time_zone = match Tz::from_str(time_zone) {
+                Ok(tz) => tz,
+                Err(_) => polars_bail!(ComputeError: "unable to parse time zone: '{}'", time_zone),
+            };
+            Ok(impl_base_utc_offset(s.datetime()?, time_unit, &time_zone).into_series())
+        }
+        _ => polars_bail!(InvalidOperation: "base_utc_offset only works on Datetime type."),
+    }
+}
+
+/// Additional (DST) offset of each value in a time-zone-aware Datetime column.
+///
+/// Returns a `Duration` (milliseconds) series; bails with `InvalidOperation`
+/// when the input dtype is not a `Datetime` carrying a time zone.
+#[polars_expr(output_type_func=duration_ms)]
+fn dst_offset(inputs: &[Series]) -> PolarsResult<Series> {
+    let s = &inputs[0];
+    match s.dtype() {
+        DataType::Datetime(time_unit, Some(time_zone)) => {
+            // Surface an unparsable zone name as an error instead of panicking.
+            let time_zone = match Tz::from_str(time_zone) {
+                Ok(tz) => tz,
+                Err(_) => polars_bail!(ComputeError: "unable to parse time zone: '{}'", time_zone),
+            };
+            Ok(impl_dst_offset(s.datetime()?, time_unit, &time_zone).into_series())
+        }
+        _ => polars_bail!(InvalidOperation: "dst_offset only works on Datetime type."),
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 0cc45cc..5698d4f 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -5,6 +5,7 @@ mod is_workday;
 mod sub;
 mod timezone;
 mod to_julian;
+mod utc_offsets;
 
 use pyo3::types::PyModule;
 use pyo3::{pymodule, PyResult, Python};
diff --git a/src/utc_offsets.rs b/src/utc_offsets.rs
new file mode 100644
index 0000000..59cf1fc
--- /dev/null
+++ b/src/utc_offsets.rs
@@ -0,0 +1,51 @@
+use chrono::TimeZone;
+use chrono_tz::OffsetComponents;
+use chrono_tz::Tz;
+use polars::prelude::*;
+use polars_arrow::temporal_conversions::{
+    timestamp_ms_to_datetime, timestamp_ns_to_datetime, timestamp_us_to_datetime,
+};
+
+/// Base UTC offset, in milliseconds, of every value in `ca`.
+///
+/// Raw timestamps are decoded per `time_unit`, treated as UTC datetimes and
+/// localized to `time_zone`; the zone's base offset is then read off.
+pub(crate) fn impl_base_utc_offset(
+    ca: &DatetimeChunked,
+    time_unit: &TimeUnit,
+    time_zone: &Tz,
+) -> DurationChunked {
+    let timestamp_to_datetime = match time_unit {
+        TimeUnit::Nanoseconds => timestamp_ns_to_datetime,
+        TimeUnit::Microseconds => timestamp_us_to_datetime,
+        TimeUnit::Milliseconds => timestamp_ms_to_datetime,
+    };
+    ca.0.apply_values(|t| {
+        let ndt = timestamp_to_datetime(t);
+        let dt = time_zone.from_utc_datetime(&ndt);
+        dt.offset().base_utc_offset().num_milliseconds()
+    })
+    .into_duration(TimeUnit::Milliseconds)
+}
+
+/// Additional (DST) offset, in milliseconds, of every value in `ca`.
+///
+/// Raw timestamps are decoded per `time_unit`, treated as UTC datetimes and
+/// localized to `time_zone`; the zone's DST offset is then read off.
+pub(crate) fn impl_dst_offset(
+    ca: &DatetimeChunked,
+    time_unit: &TimeUnit,
+    time_zone: &Tz,
+) -> DurationChunked {
+    let timestamp_to_datetime = match time_unit {
+        TimeUnit::Nanoseconds => timestamp_ns_to_datetime,
+        TimeUnit::Microseconds => timestamp_us_to_datetime,
+        TimeUnit::Milliseconds => timestamp_ms_to_datetime,
+    };
+    ca.0.apply_values(|t| {
+        let ndt = timestamp_to_datetime(t);
+        let dt = time_zone.from_utc_datetime(&ndt);
+        dt.offset().dst_offset().num_milliseconds()
+    })
+    .into_duration(TimeUnit::Milliseconds)
+}
diff --git a/tests/offsets_test.py b/tests/offsets_test.py
new file mode 100644
index 0000000..53fd9e5
--- /dev/null
+++ b/tests/offsets_test.py
@@ -0,0 +1,101 @@
+from __future__ import annotations
+import pytest
+import polars as pl
+import polars_xdt as xdt
+from datetime import datetime
+
+from typing import TYPE_CHECKING
+from polars.testing import assert_series_equal
+from polars.exceptions import ComputeError
+if TYPE_CHECKING:
+    from polars.type_aliases import TimeUnit
+
+@pytest.mark.parametrize("time_unit", ["ms", "us", "ns"])
+def test_base_utc_offset(time_unit: TimeUnit) -> None:
+    df = pl.datetime_range(
+        datetime(2011, 12, 29),
+        datetime(2012, 1, 1),
+        "2d",
+        time_zone="Pacific/Apia",
+        eager=True,
+    ).dt.cast_time_unit(time_unit).to_frame('a')
+    result = df.select(xdt.col('a').xdt.base_utc_offset().alias("base_utc_offset"))['base_utc_offset']
+    expected = pl.Series(
+        "base_utc_offset",
+        [-11 * 3600 * 1000, 13 * 3600 * 1000],
+        dtype=pl.Duration("ms"),
+    )
+    assert_series_equal(result, expected)
+
+
+def test_base_utc_offset_lazy_schema() -> None:
+    ser = pl.datetime_range(
+        datetime(2020, 10, 25),
+        datetime(2020, 10, 26),
+        time_zone="Europe/London",
+        eager=True,
+    )
+    df = pl.DataFrame({"ts": ser}).lazy()
+    result = df.with_columns(base_utc_offset=xdt.col("ts").xdt.base_utc_offset()).schema
+    expected = {
+        "ts": pl.Datetime(time_unit="us", time_zone="Europe/London"),
+        "base_utc_offset": pl.Duration(time_unit="ms"),
+    }
+    assert result == expected
+
+
+def test_base_utc_offset_invalid() -> None:
+    df = pl.datetime_range(
+        datetime(2011, 12, 29),
+        datetime(2012, 1, 1),
+        "2d",
+        eager=True,
+    ).to_frame('a')
+    with pytest.raises(
+        ComputeError,
+        match=r"base_utc_offset only works on Datetime type",
+    ):
+        df.select(xdt.col('a').xdt.base_utc_offset())
+
+
+@pytest.mark.parametrize("time_unit", ["ms", "us", "ns"])
+def test_dst_offset(time_unit: TimeUnit) -> None:
+    df = pl.datetime_range(
+        datetime(2020, 10, 25),
+        datetime(2020, 10, 26),
+        time_zone="Europe/London",
+        eager=True,
+    ).dt.cast_time_unit(time_unit).to_frame('a')
+    result = df.select(xdt.col('a').xdt.dst_offset().alias("dst_offset"))['dst_offset']
+    expected = pl.Series("dst_offset", [3_600 * 1_000, 0], dtype=pl.Duration("ms"))
+    assert_series_equal(result, expected)
+
+
+def test_dst_offset_lazy_schema() -> None:
+    ser = pl.datetime_range(
+        datetime(2020, 10, 25),
+        datetime(2020, 10, 26),
+        time_zone="Europe/London",
+        eager=True,
+    )
+    df = pl.DataFrame({"ts": ser}).lazy()
+    result = df.with_columns(dst_offset=xdt.col("ts").xdt.dst_offset()).schema
+    expected = {
+        "ts": pl.Datetime(time_unit="us", time_zone="Europe/London"),
+        "dst_offset": pl.Duration(time_unit="ms"),
+    }
+    assert result == expected
+
+
+def test_dst_offset_invalid() -> None:
+    df = pl.datetime_range(
+        datetime(2011, 12, 29),
+        datetime(2012, 1, 1),
+        "2d",
+        eager=True,
+    ).to_frame('a')
+    with pytest.raises(
+        ComputeError,
+        match=r"dst_offset only works on Datetime type",
+    ):
+        df.select(xdt.col('a').xdt.dst_offset())
\ No newline at end of file