From 4a3e5e6d61f42f8c9d33da88b950a40833835ae6 Mon Sep 17 00:00:00 2001 From: Clement Rey Date: Fri, 14 Apr 2023 12:06:01 +0200 Subject: [PATCH] support for new datatypes --- Cargo.toml | 2 ++ polars/polars-arrow/src/conversion.rs | 2 +- .../src/chunked_array/logical/categorical/from.rs | 4 ++-- .../src/chunked_array/logical/struct_/mod.rs | 8 ++++++-- polars/polars-core/src/chunked_array/ops/full.rs | 2 +- polars/polars-core/src/datatypes/any_value.rs | 6 +++--- polars/polars-core/src/datatypes/dtype.rs | 10 ++++++---- polars/polars-core/src/datatypes/field.rs | 2 +- polars/polars-core/src/series/from.rs | 6 +++--- 9 files changed, 25 insertions(+), 17 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 71d9799c344d..6a6f915c3ba5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -39,6 +39,8 @@ either = "1.8" [workspace.dependencies.arrow] package = "arrow2" +git = "https://github.com/rerun-io/arrow2" +branch = "cmc/arc_datatype" # git = "https://github.com/jorgecarleitao/arrow2" # git = "https://github.com/ritchie46/arrow2" # rev = "f258a3e06ac408aebe7a7a497694729dc65a5e46" diff --git a/polars/polars-arrow/src/conversion.rs b/polars/polars-arrow/src/conversion.rs index 4d843a87903f..af72fbab9eb6 100644 --- a/polars/polars-arrow/src/conversion.rs +++ b/polars/polars-arrow/src/conversion.rs @@ -6,7 +6,7 @@ use arrow::types::NativeType; use crate::prelude::*; pub fn chunk_to_struct(chunk: Chunk, fields: Vec) -> StructArray { - let dtype = DataType::Struct(fields); + let dtype = DataType::Struct(std::sync::Arc::new(fields)); StructArray::new(dtype, chunk.into_arrays(), None) } diff --git a/polars/polars-core/src/chunked_array/logical/categorical/from.rs b/polars/polars-core/src/chunked_array/logical/categorical/from.rs index 14b8bae20212..ff7a723b9eca 100644 --- a/polars/polars-core/src/chunked_array/logical/categorical/from.rs +++ b/polars/polars-core/src/chunked_array/logical/categorical/from.rs @@ -12,7 +12,7 @@ impl From<&CategoricalChunked> for DictionaryArray { let map = &**ca.get_rev_map(); let dtype = ArrowDataType::Dictionary( IntegerType::UInt32, - Box::new(ArrowDataType::LargeUtf8), + Arc::new(ArrowDataType::LargeUtf8), false, ); match map { @@ -47,7 +47,7 @@ impl From<&CategoricalChunked> for DictionaryArray { let map = &**ca.get_rev_map(); let dtype = ArrowDataType::Dictionary( IntegerType::UInt32, - Box::new(ArrowDataType::LargeUtf8), + Arc::new(ArrowDataType::LargeUtf8), false, ); match map { diff --git a/polars/polars-core/src/chunked_array/logical/struct_/mod.rs b/polars/polars-core/src/chunked_array/logical/struct_/mod.rs index 43e2dc057ff9..24af2ebe06db 100644 --- a/polars/polars-core/src/chunked_array/logical/struct_/mod.rs +++ b/polars/polars-core/src/chunked_array/logical/struct_/mod.rs @@ -40,7 +40,11 @@ fn fields_to_struct_array(fields: &[Series]) -> (ArrayRef, Vec) { // we determine fields from arrays as there might be object arrays // where the dtype is bound to that single array let new_fields = arrays_to_fields(&field_arrays, &fields); - let arr = StructArray::new(ArrowDataType::Struct(new_fields), field_arrays, None); + let arr = StructArray::new( + ArrowDataType::Struct(Arc::new(new_fields)), + field_arrays, + None, + ); (Box::new(arr), fields) } @@ -116,7 +120,7 @@ impl StructChunked { // where the dtype is bound to that single array let new_fields = arrays_to_fields(&field_arrays, &self.fields); let arr = Box::new(StructArray::new( - ArrowDataType::Struct(new_fields), + ArrowDataType::Struct(Arc::new(new_fields)), field_arrays, None, )) as ArrayRef; diff --git a/polars/polars-core/src/chunked_array/ops/full.rs b/polars/polars-core/src/chunked_array/ops/full.rs index 1dd1ed606eeb..b15e88ef3dbb 100644 --- a/polars/polars-core/src/chunked_array/ops/full.rs +++ b/polars/polars-core/src/chunked_array/ops/full.rs @@ -104,7 +104,7 @@ impl ChunkFullNull for ListChunked { impl ListChunked { pub fn full_null_with_dtype(name: &str, length: usize, inner_dtype: &DataType) -> ListChunked { let arr = new_null_array( - ArrowDataType::LargeList(Box::new(ArrowField::new( + ArrowDataType::LargeList(Arc::new(ArrowField::new( "item", inner_dtype.to_arrow(), true, diff --git a/polars/polars-core/src/datatypes/any_value.rs b/polars/polars-core/src/datatypes/any_value.rs index 788d60153428..455b8a3a0f4b 100644 --- a/polars/polars-core/src/datatypes/any_value.rs +++ b/polars/polars-core/src/datatypes/any_value.rs @@ -793,7 +793,7 @@ mod test { DataType::Datetime(TimeUnit::Milliseconds, None), ), ( - ArrowDataType::Timestamp(ArrowTimeUnit::Second, Some("".to_string())), + ArrowDataType::Timestamp(ArrowTimeUnit::Second, Some(Arc::new("".to_string()))), DataType::Datetime(TimeUnit::Milliseconds, Some("".to_string())), ), (ArrowDataType::LargeUtf8, DataType::Utf8), @@ -827,7 +827,7 @@ mod test { ), (ArrowDataType::Time32(ArrowTimeUnit::Second), DataType::Time), ( - ArrowDataType::List(Box::new(ArrowField::new( + ArrowDataType::List(Arc::new(ArrowField::new( "item", ArrowDataType::Float64, true, @@ -835,7 +835,7 @@ mod test { DataType::List(DataType::Float64.into()), ), ( - ArrowDataType::LargeList(Box::new(ArrowField::new( + ArrowDataType::LargeList(Arc::new(ArrowField::new( "item", ArrowDataType::Float64, true, diff --git a/polars/polars-core/src/datatypes/dtype.rs b/polars/polars-core/src/datatypes/dtype.rs index 72fe17ca217b..3393b9c50f64 100644 --- a/polars/polars-core/src/datatypes/dtype.rs +++ b/polars/polars-core/src/datatypes/dtype.rs @@ -216,10 +216,12 @@ impl DataType { Utf8 => ArrowDataType::LargeUtf8, Binary => ArrowDataType::LargeBinary, Date => ArrowDataType::Date32, - Datetime(unit, tz) => ArrowDataType::Timestamp(unit.to_arrow(), tz.clone()), + Datetime(unit, tz) => { + ArrowDataType::Timestamp(unit.to_arrow(), tz.clone().map(Arc::new)) + } Duration(unit) => ArrowDataType::Duration(unit.to_arrow()), Time => ArrowDataType::Time64(ArrowTimeUnit::Nanosecond), - List(dt) => ArrowDataType::LargeList(Box::new(arrow::datatypes::Field::new( + List(dt) => ArrowDataType::LargeList(Arc::new(arrow::datatypes::Field::new( "item", dt.to_arrow(), true, @@ -230,13 +232,13 @@ impl DataType { #[cfg(feature = "dtype-categorical")] Categorical(_) => ArrowDataType::Dictionary( IntegerType::UInt32, - Box::new(ArrowDataType::LargeUtf8), + Arc::new(ArrowDataType::LargeUtf8), false, ), #[cfg(feature = "dtype-struct")] Struct(fields) => { let fields = fields.iter().map(|fld| fld.to_arrow()).collect(); - ArrowDataType::Struct(fields) + ArrowDataType::Struct(Arc::new(fields)) } Unknown => unreachable!(), } diff --git a/polars/polars-core/src/datatypes/field.rs b/polars/polars-core/src/datatypes/field.rs index 3aa501458275..832c8e99a943 100644 --- a/polars/polars-core/src/datatypes/field.rs +++ b/polars/polars-core/src/datatypes/field.rs @@ -127,7 +127,7 @@ impl From<&ArrowDataType> for DataType { ArrowDataType::LargeList(f) => DataType::List(Box::new(f.data_type().into())), ArrowDataType::List(f) => DataType::List(Box::new(f.data_type().into())), ArrowDataType::Date32 => DataType::Date, - ArrowDataType::Timestamp(tu, tz) => DataType::Datetime(tu.into(), tz.clone()), + ArrowDataType::Timestamp(tu, tz) => DataType::Datetime(tu.into(), tz.as_ref().map(|tz| tz.to_string())), ArrowDataType::Duration(tu) => DataType::Duration(tu.into()), ArrowDataType::Date64 => DataType::Datetime(TimeUnit::Milliseconds, None), ArrowDataType::LargeUtf8 | ArrowDataType::Utf8 => DataType::Utf8, diff --git a/polars/polars-core/src/series/from.rs b/polars/polars-core/src/series/from.rs index f8d05260a855..fcd60711922d 100644 --- a/polars/polars-core/src/series/from.rs +++ b/polars/polars-core/src/series/from.rs @@ -154,13 +154,13 @@ impl Series { #[cfg(feature = "dtype-datetime")] ArrowDataType::Timestamp(tu, tz) => { let mut tz = tz.clone(); - if tz.as_deref() == Some("") { + if tz.as_ref().map(|tz| tz.as_str()) == Some("") { tz = None; } // we still drop timezone for now let chunks = cast_chunks(&chunks, &DataType::Int64, false).unwrap(); let s = Int64Chunked::from_chunks(name, chunks) - .into_datetime(tu.into(), tz) + .into_datetime(tu.into(), tz.map(|tz| tz.to_string())) .into_series(); Ok(match tu { ArrowTimeUnit::Second => &s * MILLISECONDS, @@ -461,7 +461,7 @@ fn convert_inner_types(arr: &ArrayRef) -> ArrayRef { .map(|(arr, field)| ArrowField::new(&field.name, arr.data_type().clone(), true)) .collect(); Box::new(StructArray::new( - ArrowDataType::Struct(fields), + ArrowDataType::Struct(Arc::new(fields)), values, arr.validity().cloned(), ))