diff --git a/Cargo.toml b/Cargo.toml index c79db31..fa82c5c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ members = [ polars = { git = "https://github.com/pola-rs/polars", rev = "d00a43203b3ade009a5f858f4c698b6a50f5b1e6", version = "0.33.2", default-features = false } polars-core = { git = "https://github.com/pola-rs/polars", rev = "d00a43203b3ade009a5f858f4c698b6a50f5b1e6", version = "0.33.2", default-features = false } polars-ffi = { git = "https://github.com/pola-rs/polars", rev = "d00a43203b3ade009a5f858f4c698b6a50f5b1e6", version = "0.33.2", default-features = false } - polars-plan = { git = "https://github.com/pola-rs/polars", rev = "d00a43203b3ade009a5f858f4c698b6a50f5b1e6", version = "0.33.2", default-feautres = false } + polars-plan = { git = "https://github.com/pola-rs/polars", rev = "d00a43203b3ade009a5f858f4c698b6a50f5b1e6", version = "0.33.2", default-features = false } polars-lazy = { git = "https://github.com/pola-rs/polars", rev = "d00a43203b3ade009a5f858f4c698b6a50f5b1e6", version = "0.33.2", default-features = false } polars-time = { git = "https://github.com/pola-rs/polars", rev = "d00a43203b3ade009a5f858f4c698b6a50f5b1e6", version = "0.33.2", features = ["timezones"], default-features = false } polars-ops = { git = "https://github.com/pola-rs/polars", rev = "d00a43203b3ade009a5f858f4c698b6a50f5b1e6", version = "0.33.2", default-features = false } diff --git a/example/derive_expression/expression_lib/.gitignore b/example/derive_expression/expression_lib/.gitignore deleted file mode 100644 index af3ca5e..0000000 --- a/example/derive_expression/expression_lib/.gitignore +++ /dev/null @@ -1,72 +0,0 @@ -/target - -# Byte-compiled / optimized / DLL files -__pycache__/ -.pytest_cache/ -*.py[cod] - -# C extensions -*.so - -# Distribution / packaging -.Python -.venv/ -env/ -bin/ -build/ -develop-eggs/ -dist/ -eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -include/ -man/ -venv/ -*.egg-info/ -.installed.cfg -*.egg - -# Installer logs -pip-log.txt 
-pip-delete-this-directory.txt -pip-selfcheck.json - -# Unit test / coverage reports -htmlcov/ -.tox/ -.coverage -.cache -nosetests.xml -coverage.xml - -# Translations -*.mo - -# Mr Developer -.mr.developer.cfg -.project -.pydevproject - -# Rope -.ropeproject - -# Django stuff: -*.log -*.pot - -.DS_Store - -# Sphinx documentation -docs/_build/ - -# PyCharm -.idea/ - -# VSCode -.vscode/ - -# Pyenv -.python-version \ No newline at end of file diff --git a/example/derive_expression/expression_lib/Cargo.toml b/example/derive_expression/expression_lib/Cargo.toml deleted file mode 100644 index 9c85b1f..0000000 --- a/example/derive_expression/expression_lib/Cargo.toml +++ /dev/null @@ -1,17 +0,0 @@ -[package] -name = "expression_lib" -version = "0.1.0" -edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html -[lib] -name = "expression_lib" -crate-type = ["cdylib"] - -[dependencies] -jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] } -polars = { workspace = true, features = ["fmt", "dtype-date"], default-features = false } -polars-plan = { workspace = true, default-features = false } -pyo3 = { version = "0.20.0", features = ["extension-module"] } -pyo3-polars = { version = "*", path = "../../../pyo3-polars", features = ["derive"] } -serde = { version = "1", features = ["derive"] } diff --git a/example/derive_expression/expression_lib/expression_lib/__init__.py b/example/derive_expression/expression_lib/expression_lib/__init__.py deleted file mode 100644 index 9b3be74..0000000 --- a/example/derive_expression/expression_lib/expression_lib/__init__.py +++ /dev/null @@ -1,91 +0,0 @@ -import polars as pl -from polars.type_aliases import IntoExpr -from polars.utils.udfs import _get_shared_lib_location - -lib = _get_shared_lib_location(__file__) - - -@pl.api.register_expr_namespace("language") -class Language: - def __init__(self, expr: pl.Expr): - self._expr = expr - - def pig_latinnify(self) 
-> pl.Expr: - return self._expr._register_plugin( - lib=lib, - symbol="pig_latinnify", - is_elementwise=True, - ) - - def append_args( - self, - float_arg: float, - integer_arg: int, - string_arg: str, - boolean_arg: bool, - ) -> pl.Expr: - """ - This example shows how arguments other than `Series` can be used. - """ - return self._expr._register_plugin( - lib=lib, - args=[], - kwargs={ - "float_arg": float_arg, - "integer_arg": integer_arg, - "string_arg": string_arg, - "boolean_arg": boolean_arg, - }, - symbol="append_kwargs", - is_elementwise=True, - ) - - -@pl.api.register_expr_namespace("dist") -class Distance: - def __init__(self, expr: pl.Expr): - self._expr = expr - - def hamming_distance(self, other: IntoExpr) -> pl.Expr: - return self._expr._register_plugin( - lib=lib, - args=[other], - symbol="hamming_distance", - is_elementwise=True, - ) - - def jaccard_similarity(self, other: IntoExpr) -> pl.Expr: - return self._expr._register_plugin( - lib=lib, - args=[other], - symbol="jaccard_similarity", - is_elementwise=True, - ) - - def haversine( - self, - start_lat: IntoExpr, - start_long: IntoExpr, - end_lat: IntoExpr, - end_long: IntoExpr, - ) -> pl.Expr: - return self._expr._register_plugin( - lib=lib, - args=[start_lat, start_long, end_lat, end_long], - symbol="haversine", - is_elementwise=True, - cast_to_supertypes=True, - ) - -@pl.api.register_expr_namespace("date_util") -class DateUtil: - def __init__(self, expr: pl.Expr): - self._expr = expr - - - def is_leap_year(self) -> pl.Expr: - return self._expr._register_plugin( - lib=lib, - symbol="is_leap_year", - is_elementwise=True, - ) diff --git a/example/derive_expression/expression_lib/pyproject.toml b/example/derive_expression/expression_lib/pyproject.toml deleted file mode 100644 index 607be31..0000000 --- a/example/derive_expression/expression_lib/pyproject.toml +++ /dev/null @@ -1,12 +0,0 @@ -[build-system] -requires = ["maturin>=1.0,<2.0"] -build-backend = "maturin" - -[project] -name = 
"expression_lib" -requires-python = ">=3.8" -classifiers = [ - "Programming Language :: Rust", - "Programming Language :: Python :: Implementation :: CPython", - "Programming Language :: Python :: Implementation :: PyPy", -] diff --git a/example/derive_expression/expression_lib/src/distances.rs b/example/derive_expression/expression_lib/src/distances.rs deleted file mode 100644 index 943aa4a..0000000 --- a/example/derive_expression/expression_lib/src/distances.rs +++ /dev/null @@ -1,95 +0,0 @@ -use polars::datatypes::PlHashSet; -use polars::export::arrow::array::PrimitiveArray; -use polars::export::num::Float; -use polars::prelude::*; -use pyo3_polars::export::polars_core::utils::arrow::types::NativeType; -use pyo3_polars::export::polars_core::with_match_physical_integer_type; -use std::hash::Hash; - -#[allow(clippy::all)] -pub(super) fn naive_hamming_dist(a: &str, b: &str) -> u32 { - let x = a.as_bytes(); - let y = b.as_bytes(); - x.iter() - .zip(y) - .fold(0, |a, (b, c)| a + (*b ^ *c).count_ones() as u32) -} - -fn jacc_helper(a: &PrimitiveArray, b: &PrimitiveArray) -> f64 { - // convert to hashsets over Option - let s1 = a.into_iter().collect::>(); - let s2 = b.into_iter().collect::>(); - - // count the number of intersections - let s3_len = s1.intersection(&s2).count(); - // return similarity - s3_len as f64 / (s1.len() + s2.len() - s3_len) as f64 -} - -pub(super) fn naive_jaccard_sim(a: &ListChunked, b: &ListChunked) -> PolarsResult { - polars_ensure!( - a.inner_dtype() == b.inner_dtype(), - ComputeError: "inner data types don't match" - ); - polars_ensure!( - a.inner_dtype().is_integer(), - ComputeError: "inner data types must be integer" - ); - Ok(with_match_physical_integer_type!(a.inner_dtype(), |$T| { - polars::prelude::arity::binary_elementwise(a, b, |a, b| { - match (a, b) { - (Some(a), Some(b)) => { - let a = a.as_any().downcast_ref::>().unwrap(); - let b = b.as_any().downcast_ref::>().unwrap(); - Some(jacc_helper(a, b)) - }, - _ => None - } - }) - })) 
-} - -fn haversine_elementwise(start_lat: T, start_long: T, end_lat: T, end_long: T) -> T { - let r_in_km = T::from(6371.0).unwrap(); - let two = T::from(2.0).unwrap(); - let one = T::one(); - - let d_lat = (end_lat - start_lat).to_radians(); - let d_lon = (end_long - start_long).to_radians(); - let lat1 = (start_lat).to_radians(); - let lat2 = (end_lat).to_radians(); - - let a = ((d_lat / two).sin()) * ((d_lat / two).sin()) - + ((d_lon / two).sin()) * ((d_lon / two).sin()) * (lat1.cos()) * (lat2.cos()); - let c = two * ((a.sqrt()).atan2((one - a).sqrt())); - r_in_km * c -} - -pub(super) fn naive_haversine( - start_lat: &ChunkedArray, - start_long: &ChunkedArray, - end_lat: &ChunkedArray, - end_long: &ChunkedArray, -) -> PolarsResult> -where - T: PolarsFloatType, - T::Native: Float, -{ - let out: ChunkedArray = start_lat - .into_iter() - .zip(start_long.into_iter()) - .zip(end_lat.into_iter()) - .zip(end_long.into_iter()) - .map(|(((start_lat, start_long), end_lat), end_long)| { - let start_lat = start_lat?; - let start_long = start_long?; - let end_lat = end_lat?; - let end_long = end_long?; - Some(haversine_elementwise( - start_lat, start_long, end_lat, end_long, - )) - }) - .collect(); - - Ok(out.with_name(start_lat.name())) -} diff --git a/example/derive_expression/expression_lib/src/expressions.rs b/example/derive_expression/expression_lib/src/expressions.rs deleted file mode 100644 index e0ff2ac..0000000 --- a/example/derive_expression/expression_lib/src/expressions.rs +++ /dev/null @@ -1,107 +0,0 @@ -use polars::prelude::*; -use polars_plan::dsl::FieldsMapper; -use pyo3_polars::derive::polars_expr; -use serde::Deserialize; -use std::fmt::Write; - -fn pig_latin_str(value: &str, output: &mut String) { - if let Some(first_char) = value.chars().next() { - write!(output, "{}{}ay", &value[1..], first_char).unwrap() - } -} - -#[polars_expr(output_type=Utf8)] -fn pig_latinnify(inputs: &[Series]) -> PolarsResult { - let ca = inputs[0].utf8()?; - let out: Utf8Chunked 
= ca.apply_to_buffer(pig_latin_str); - Ok(out.into_series()) -} - -#[polars_expr(output_type=Float64)] -fn jaccard_similarity(inputs: &[Series]) -> PolarsResult { - let a = inputs[0].list()?; - let b = inputs[1].list()?; - crate::distances::naive_jaccard_sim(a, b).map(|ca| ca.into_series()) -} - -#[polars_expr(output_type=Float64)] -fn hamming_distance(inputs: &[Series]) -> PolarsResult { - let a = inputs[0].utf8()?; - let b = inputs[1].utf8()?; - let out: UInt32Chunked = - arity::binary_elementwise_values(a, b, crate::distances::naive_hamming_dist); - Ok(out.into_series()) -} - -fn haversine_output(input_fields: &[Field]) -> PolarsResult { - FieldsMapper::new(input_fields).map_to_float_dtype() -} - -#[polars_expr(type_func=haversine_output)] -fn haversine(inputs: &[Series]) -> PolarsResult { - let out = match inputs[0].dtype() { - DataType::Float32 => { - let start_lat = inputs[0].f32().unwrap(); - let start_long = inputs[1].f32().unwrap(); - let end_lat = inputs[2].f32().unwrap(); - let end_long = inputs[3].f32().unwrap(); - crate::distances::naive_haversine(start_lat, start_long, end_lat, end_long)? - .into_series() - } - DataType::Float64 => { - let start_lat = inputs[0].f64().unwrap(); - let start_long = inputs[1].f64().unwrap(); - let end_lat = inputs[2].f64().unwrap(); - let end_long = inputs[3].f64().unwrap(); - crate::distances::naive_haversine(start_lat, start_long, end_lat, end_long)? - .into_series() - } - _ => unimplemented!(), - }; - Ok(out) -} - -/// The `DefaultKwargs` isn't very ergonomic as it doesn't validate any schema. -/// Provide your own kwargs struct with the proper schema and accept that type -/// in your plugin expression. -#[derive(Deserialize)] -pub struct MyKwargs { - float_arg: f64, - integer_arg: i64, - string_arg: String, - boolean_arg: bool, -} - -/// If you want to accept `kwargs`. You define a `kwargs` argument -/// on the second position in you plugin. 
You can provide any custom struct that is deserializable -/// with the pickle protocol (on the rust side). -#[polars_expr(output_type=Utf8)] -fn append_kwargs(input: &[Series], kwargs: MyKwargs) -> PolarsResult { - let input = &input[0]; - let input = input.cast(&DataType::Utf8)?; - let ca = input.utf8().unwrap(); - - Ok(ca - .apply_to_buffer(|val, buf| { - write!( - buf, - "{}-{}-{}-{}-{}", - val, kwargs.float_arg, kwargs.integer_arg, kwargs.string_arg, kwargs.boolean_arg - ) - .unwrap() - }) - .into_series()) -} - -#[polars_expr(output_type=Boolean)] -fn is_leap_year(input: &[Series]) -> PolarsResult { - let input = &input[0]; - let ca = input.date()?; - - let out: BooleanChunked = ca - .as_date_iter() - .map(|opt_dt| opt_dt.map(|dt| dt.leap_year())) - .collect_ca(ca.name()); - - Ok(out.into_series()) -} diff --git a/example/derive_expression/expression_lib/src/lib.rs b/example/derive_expression/expression_lib/src/lib.rs deleted file mode 100644 index d5c6766..0000000 --- a/example/derive_expression/expression_lib/src/lib.rs +++ /dev/null @@ -1,9 +0,0 @@ -mod distances; -mod expressions; - -#[cfg(target_os = "linux")] -use jemallocator::Jemalloc; - -#[global_allocator] -#[cfg(target_os = "linux")] -static ALLOC: Jemalloc = Jemalloc; diff --git a/example/derive_expression/polars_business/Cargo.toml b/example/derive_expression/polars_business/Cargo.toml index b30bcb5..d06adf4 100644 --- a/example/derive_expression/polars_business/Cargo.toml +++ b/example/derive_expression/polars_business/Cargo.toml @@ -10,7 +10,6 @@ crate-type = ["cdylib"] [dependencies] ahash = "0.8.3" -jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] } polars = { workspace = true, features = ["fmt", "dtype-date"], default-features = false } polars-time = { workspace = true, default-features = false } polars-plan = { workspace = true, default-features = false } @@ -18,3 +17,6 @@ polars-ops = { workspace = true, default-features = false } pyo3 = { version = "0.20.0", 
features = ["extension-module"] } pyo3-polars = { version = "*", path = "../../../pyo3-polars", features = ["derive"] } serde = { version = "1", features = ["derive"] } + +[target.'cfg(target_os = "linux")'.dependencies] +jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }