From 6b7f8f075c816d105280c8aa34b872dc7fecfb29 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 25 Feb 2024 13:11:07 +0000 Subject: [PATCH 1/4] initial working solution! --- polars_xdt/__init__.py | 2 ++ polars_xdt/functions.py | 10 ++++++++++ src/expressions.rs | 17 +++++++++++++++++ src/lib.rs | 1 + 4 files changed, 30 insertions(+) diff --git a/polars_xdt/__init__.py b/polars_xdt/__init__.py index 9099b37..c1d3345 100644 --- a/polars_xdt/__init__.py +++ b/polars_xdt/__init__.py @@ -2,6 +2,7 @@ import polars_xdt.namespace # noqa: F401 from polars_xdt.functions import ( + previous_higher, ceil, day_name, format_localized, @@ -29,5 +30,6 @@ "to_julian_date", "to_local_datetime", "workday_count", + "previous_higher", "__version__", ] diff --git a/polars_xdt/functions.py b/polars_xdt/functions.py index 366d7a8..52ed8a4 100644 --- a/polars_xdt/functions.py +++ b/polars_xdt/functions.py @@ -731,3 +731,13 @@ def workday_count( "holidays": holidays_int, }, ) + +def previous_higher( + expr: IntoExpr +) -> pl.Expr: + expr = parse_into_expr(expr) + return expr.register_plugin( + lib=lib, + symbol="previous_higher", + is_elementwise=False, + ) diff --git a/src/expressions.rs b/src/expressions.rs index 9f5d8a2..2809844 100644 --- a/src/expressions.rs +++ b/src/expressions.rs @@ -2,6 +2,7 @@ use crate::business_days::*; use crate::format_localized::*; use crate::is_workday::*; +use crate::previous_higher::*; use crate::sub::*; use crate::timezone::*; use crate::to_julian::*; @@ -146,3 +147,19 @@ fn dst_offset(inputs: &[Series]) -> PolarsResult { _ => polars_bail!(InvalidOperation: "base_utc_offset only works on Datetime type."), } } + +fn list_idx_dtype(input_fields: &[Field]) -> PolarsResult { + let field = Field::new(input_fields[0].name(), DataType::List(Box::new(IDX_DTYPE))); + Ok(field.clone()) +} + +#[polars_expr(output_type_func=list_idx_dtype)] +fn previous_higher(inputs: &[Series]) -> PolarsResult { + let ca = 
inputs[0].i64()?; + // steps: + // 1. make generic on inputs[0] + // 2. optionally accept second argument? + // or at least, take-based solution + let out = impl_previous_higher(ca); + Ok(out.into_series()) +} diff --git a/src/lib.rs b/src/lib.rs index 5698d4f..c3e9dc4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,6 +6,7 @@ mod sub; mod timezone; mod to_julian; mod utc_offsets; +mod previous_higher; use pyo3::types::PyModule; use pyo3::{pymodule, PyResult, Python}; From 5f9c8515007da4aec694442057f49e21e91f126d Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 25 Feb 2024 13:46:10 +0000 Subject: [PATCH 2/4] wip --- polars_xdt/__init__.py | 4 ++-- polars_xdt/functions.py | 36 +++++++++++++++++++++++++++++++++-- src/arg_prev_greater_value.rs | 36 +++++++++++++++++++++++++++++++++++ src/expressions.rs | 17 ++++++++++++----- src/lib.rs | 2 +- 5 files changed, 85 insertions(+), 10 deletions(-) create mode 100644 src/arg_prev_greater_value.rs diff --git a/polars_xdt/__init__.py b/polars_xdt/__init__.py index c1d3345..e119d69 100644 --- a/polars_xdt/__init__.py +++ b/polars_xdt/__init__.py @@ -2,7 +2,7 @@ import polars_xdt.namespace # noqa: F401 from polars_xdt.functions import ( - previous_higher, + arg_prev_greater_value, ceil, day_name, format_localized, @@ -30,6 +30,6 @@ "to_julian_date", "to_local_datetime", "workday_count", - "previous_higher", + "arg_prev_greater_value", "__version__", ] diff --git a/polars_xdt/functions.py b/polars_xdt/functions.py index 52ed8a4..541e0ae 100644 --- a/polars_xdt/functions.py +++ b/polars_xdt/functions.py @@ -732,12 +732,44 @@ def workday_count( }, ) -def previous_higher( +def arg_prev_greater_value( expr: IntoExpr ) -> pl.Expr: + """ + Find the row count of the previous value greater than the current one. + + Parameters + ---------- + expr + Expression. + + Returns + ------- + Expr + UInt64 or UInt32 type, depending on the platform. 
+ + Examples + -------- + >>> import polars as pl + >>> import polars_xdt as xdt + >>> df = pl.DataFrame({'value': [1, 9, 6, 7, 3]}) + >>> df.with_columns(result=xdt.arg_prev_greater_value('value')) + shape: (5, 2) + ┌───────┬────────┐ + │ value ┆ result │ + │ --- ┆ --- │ + │ i64 ┆ u32 │ + ╞═══════╪════════╡ + │ 1 ┆ null │ + │ 9 ┆ 0 │ + │ 6 ┆ 1 │ + │ 7 ┆ 2 │ + │ 3 ┆ 1 │ + └───────┴────────┘ + """ expr = parse_into_expr(expr) return expr.register_plugin( lib=lib, - symbol="previous_higher", + symbol="arg_prev_greater_value", is_elementwise=False, ) diff --git a/src/arg_prev_greater_value.rs b/src/arg_prev_greater_value.rs new file mode 100644 index 0000000..b5283b8 --- /dev/null +++ b/src/arg_prev_greater_value.rs @@ -0,0 +1,36 @@ +use polars::prelude::*; + +pub(crate) fn impl_arg_prev_greater_value(ca: &ChunkedArray) -> IdxCa + where T: PolarsNumericType { + let mut idx: Vec> = Vec::with_capacity(ca.len()); + let out: IdxCa = ca + .into_iter() + .enumerate() + .map(|(i, opt_val)| { + if opt_val.is_none() { + idx.push(None); + return None + } + let i_curr = i; + let mut i = Some((i as i32) - 1); // look at previous element + while i >= Some(0) && ca.get(i.unwrap() as usize).is_none() { + // find previous non-null value + i = Some(i.unwrap()-1) + } + if i < Some(0) { + idx.push(None); + return None + } + while i.is_some() && opt_val >= ca.get(i.unwrap() as usize) { + i = idx[i.unwrap() as usize]; + } + if i.is_none() { + idx.push(None); + return Some(i_curr as IdxSize) + } + idx.push(i); + i.map(|x| x as IdxSize) + }) + .collect(); + out +} diff --git a/src/expressions.rs b/src/expressions.rs index 2809844..348d3df 100644 --- a/src/expressions.rs +++ b/src/expressions.rs @@ -2,7 +2,7 @@ use crate::business_days::*; use crate::format_localized::*; use crate::is_workday::*; -use crate::previous_higher::*; +use crate::arg_prev_greater_value::*; use crate::sub::*; use crate::timezone::*; use crate::to_julian::*; @@ -154,12 +154,19 @@ fn list_idx_dtype(input_fields: 
&[Field]) -> PolarsResult { } #[polars_expr(output_type_func=list_idx_dtype)] -fn previous_higher(inputs: &[Series]) -> PolarsResult { - let ca = inputs[0].i64()?; +fn arg_prev_greater_value(inputs: &[Series]) -> PolarsResult { + let ser = &inputs[0]; // steps: // 1. make generic on inputs[0] // 2. optionally accept second argument? // or at least, take-based solution - let out = impl_previous_higher(ca); - Ok(out.into_series()) + match ser.dtype() { + DataType::Int64 => Ok(impl_arg_prev_greater_value(ser.i64().unwrap()).into_series()), + DataType::Int32 => Ok(impl_arg_prev_greater_value(ser.i32().unwrap()).into_series()), + DataType::UInt64 => Ok(impl_arg_prev_greater_value(ser.u64().unwrap()).into_series()), + DataType::UInt32 => Ok(impl_arg_prev_greater_value(ser.u32().unwrap()).into_series()), + DataType::Float64 => Ok(impl_arg_prev_greater_value(ser.f64().unwrap()).into_series()), + DataType::Float32 => Ok(impl_arg_prev_greater_value(ser.f32().unwrap()).into_series()), + dt => polars_bail!(ComputeError:"Expected numeric data type, got: {}", dt) + } } diff --git a/src/lib.rs b/src/lib.rs index c3e9dc4..da8250e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,7 +6,7 @@ mod sub; mod timezone; mod to_julian; mod utc_offsets; -mod previous_higher; +mod arg_prev_greater_value; use pyo3::types::PyModule; use pyo3::{pymodule, PyResult, Python}; From 6e11ff522ade137a5764fc77429fd5c7eb4defe3 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 25 Feb 2024 16:49:15 +0000 Subject: [PATCH 3/4] add arg_previous_greater --- polars_xdt/__init__.py | 4 +- polars_xdt/functions.py | 76 ++++++++++++++++--- ...eater_value.rs => arg_previous_greater.rs} | 16 ++-- src/expressions.rs | 22 +++--- src/lib.rs | 2 +- 5 files changed, 86 insertions(+), 34 deletions(-) rename src/{arg_prev_greater_value.rs => arg_previous_greater.rs} (71%) diff --git a/polars_xdt/__init__.py b/polars_xdt/__init__.py index e119d69..abe34c1 100644 --- 
a/polars_xdt/__init__.py +++ b/polars_xdt/__init__.py @@ -2,7 +2,7 @@ import polars_xdt.namespace # noqa: F401 from polars_xdt.functions import ( - arg_prev_greater_value, + arg_previous_greater, ceil, day_name, format_localized, @@ -30,6 +30,6 @@ "to_julian_date", "to_local_datetime", "workday_count", - "arg_prev_greater_value", + "arg_previous_greater", "__version__", ] diff --git a/polars_xdt/functions.py b/polars_xdt/functions.py index 541e0ae..38307bd 100644 --- a/polars_xdt/functions.py +++ b/polars_xdt/functions.py @@ -732,9 +732,8 @@ def workday_count( }, ) -def arg_prev_greater_value( - expr: IntoExpr -) -> pl.Expr: + +def arg_previous_greater(expr: IntoExpr) -> pl.Expr: """ Find the row count of the previous value greater than the current one. @@ -742,18 +741,18 @@ def arg_prev_greater_value( ---------- expr Expression. - + Returns ------- Expr UInt64 or UInt32 type, depending on the platform. - + Examples -------- >>> import polars as pl >>> import polars_xdt as xdt - >>> df = pl.DataFrame({'value': [1, 9, 6, 7, 3]}) - >>> df.with_columns(result=xdt.arg_prev_greater_value('value')) + >>> df = pl.DataFrame({"value": [1, 9, 6, 7, 3]}) + >>> df.with_columns(result=xdt.arg_previous_greater("value")) shape: (5, 2) ┌───────┬────────┐ │ value ┆ result │ @@ -761,15 +760,70 @@ def arg_prev_greater_value( │ i64 ┆ u32 │ ╞═══════╪════════╡ │ 1 ┆ null │ - │ 9 ┆ 0 │ + │ 9 ┆ 1 │ │ 6 ┆ 1 │ - │ 7 ┆ 2 │ - │ 3 ┆ 1 │ + │ 7 ┆ 1 │ + │ 3 ┆ 3 │ └───────┴────────┘ + + This can be useful when working with time series. For example, + if you have a dataset like this: + + >>> df = pl.DataFrame( + ... { + ... "date": [ + ... "2024-02-01", + ... "2024-02-02", + ... "2024-02-03", + ... "2024-02-04", + ... "2024-02-05", + ... "2024-02-06", + ... "2024-02-07", + ... "2024-02-08", + ... "2024-02-09", + ... "2024-02-10", + ... ], + ... "group": ["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"], + ... "value": [1, 9, None, 7, 3, 2, 4, 5, 1, 9], + ... } + ... 
) + >>> df = df.with_columns(pl.col("date").str.to_date()) + + and want to find out, for each day and each item, how many days it's + been since `'value'` was higher than it currently is, you could do + + >>> df.with_columns( + ... result=( + ... ( + ... pl.col("date") + ... - pl.col("date") + ... .gather(xdt.arg_previous_greater("value")) + ... .over("group") + ... ).dt.total_days() + ... ), + ... ) + shape: (10, 4) + ┌────────────┬───────┬───────┬────────┐ + │ date ┆ group ┆ value ┆ result │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ date ┆ str ┆ i64 ┆ i64 │ + ╞════════════╪═══════╪═══════╪════════╡ + │ 2024-02-01 ┆ A ┆ 1 ┆ null │ + │ 2024-02-02 ┆ A ┆ 9 ┆ 0 │ + │ 2024-02-03 ┆ A ┆ null ┆ null │ + │ 2024-02-04 ┆ A ┆ 7 ┆ 2 │ + │ 2024-02-05 ┆ A ┆ 3 ┆ 1 │ + │ 2024-02-06 ┆ B ┆ 2 ┆ null │ + │ 2024-02-07 ┆ B ┆ 4 ┆ 0 │ + │ 2024-02-08 ┆ B ┆ 5 ┆ 0 │ + │ 2024-02-09 ┆ B ┆ 1 ┆ 1 │ + │ 2024-02-10 ┆ B ┆ 9 ┆ 0 │ + └────────────┴───────┴───────┴────────┘ + """ expr = parse_into_expr(expr) return expr.register_plugin( lib=lib, - symbol="arg_prev_greater_value", + symbol="arg_previous_greater", is_elementwise=False, ) diff --git a/src/arg_prev_greater_value.rs b/src/arg_previous_greater.rs similarity index 71% rename from src/arg_prev_greater_value.rs rename to src/arg_previous_greater.rs index b5283b8..b3e4907 100644 --- a/src/arg_prev_greater_value.rs +++ b/src/arg_previous_greater.rs @@ -1,7 +1,9 @@ use polars::prelude::*; -pub(crate) fn impl_arg_prev_greater_value(ca: &ChunkedArray) -> IdxCa - where T: PolarsNumericType { +pub(crate) fn impl_arg_previous_greater(ca: &ChunkedArray) -> IdxCa +where + T: PolarsNumericType, +{ let mut idx: Vec> = Vec::with_capacity(ca.len()); let out: IdxCa = ca .into_iter() @@ -9,24 +11,24 @@ pub(crate) fn impl_arg_prev_greater_value(ca: &ChunkedArray) -> IdxCa .map(|(i, opt_val)| { if opt_val.is_none() { idx.push(None); - return None + return None; } let i_curr = i; - let mut i = Some((i as i32) - 
1); // look at previous element while i >= Some(0) && ca.get(i.unwrap() as usize).is_none() { // find previous non-null value - i = Some(i.unwrap()-1) + i = Some(i.unwrap() - 1) } if i < Some(0) { idx.push(None); - return None + return None; } while i.is_some() && opt_val >= ca.get(i.unwrap() as usize) { i = idx[i.unwrap() as usize]; } if i.is_none() { idx.push(None); - return Some(i_curr as IdxSize) + return Some(i_curr as IdxSize); } idx.push(i); i.map(|x| x as IdxSize) diff --git a/src/expressions.rs b/src/expressions.rs index 348d3df..8331725 100644 --- a/src/expressions.rs +++ b/src/expressions.rs @@ -1,8 +1,8 @@ #![allow(clippy::unit_arg, clippy::unused_unit)] +use crate::arg_previous_greater::*; use crate::business_days::*; use crate::format_localized::*; use crate::is_workday::*; -use crate::arg_prev_greater_value::*; use crate::sub::*; use crate::timezone::*; use crate::to_julian::*; @@ -154,19 +154,15 @@ fn list_idx_dtype(input_fields: &[Field]) -> PolarsResult { } #[polars_expr(output_type_func=list_idx_dtype)] -fn arg_prev_greater_value(inputs: &[Series]) -> PolarsResult { +fn arg_previous_greater(inputs: &[Series]) -> PolarsResult { let ser = &inputs[0]; - // steps: - // 1. make generic on inputs[0] - // 2. optionally accept second argument? 
- // or at least, take-based solution match ser.dtype() { - DataType::Int64 => Ok(impl_arg_prev_greater_value(ser.i64().unwrap()).into_series()), - DataType::Int32 => Ok(impl_arg_prev_greater_value(ser.i32().unwrap()).into_series()), - DataType::UInt64 => Ok(impl_arg_prev_greater_value(ser.u64().unwrap()).into_series()), - DataType::UInt32 => Ok(impl_arg_prev_greater_value(ser.u32().unwrap()).into_series()), - DataType::Float64 => Ok(impl_arg_prev_greater_value(ser.f64().unwrap()).into_series()), - DataType::Float32 => Ok(impl_arg_prev_greater_value(ser.f32().unwrap()).into_series()), - dt => polars_bail!(ComputeError:"Expected numeric data type, got: {}", dt) + DataType::Int64 => Ok(impl_arg_previous_greater(ser.i64().unwrap()).into_series()), + DataType::Int32 => Ok(impl_arg_previous_greater(ser.i32().unwrap()).into_series()), + DataType::UInt64 => Ok(impl_arg_previous_greater(ser.u64().unwrap()).into_series()), + DataType::UInt32 => Ok(impl_arg_previous_greater(ser.u32().unwrap()).into_series()), + DataType::Float64 => Ok(impl_arg_previous_greater(ser.f64().unwrap()).into_series()), + DataType::Float32 => Ok(impl_arg_previous_greater(ser.f32().unwrap()).into_series()), + dt => polars_bail!(ComputeError:"Expected numeric data type, got: {}", dt), } } diff --git a/src/lib.rs b/src/lib.rs index da8250e..dbc8d40 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,4 @@ +mod arg_previous_greater; mod business_days; mod expressions; mod format_localized; @@ -6,7 +7,6 @@ mod sub; mod timezone; mod to_julian; mod utc_offsets; -mod arg_prev_greater_value; use pyo3::types::PyModule; use pyo3::{pymodule, PyResult, Python}; From 0f51e6081412fd91414f43133434d054201ff4cd Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 25 Feb 2024 16:58:44 +0000 Subject: [PATCH 4/4] Bump version to 0.13.0 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 134c902..cb53990 100644 
--- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "polars_xdt" -version = "0.12.11" +version = "0.13.0" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html