Skip to content

Commit

Permalink
Port codegened arrow serialization to arrow1 (#8208)
Browse files Browse the repository at this point in the history
* Follows #8206
* Part of #3741

## Changes
To implement nullable unions, we have a `_null_marker: Null` variant in
all our unions. This means all our unions are nullable.

Previously we would only mark a struct field as nullable if it was
declared as such in the `.fbs` file, but `arrow-rs` complains about
this. So with this PR, if a struct field refers to a union type, that
struct field will be marked as `nullable: true` in the datatype (in
Rust, Python and C++).
  • Loading branch information
emilk authored Nov 25, 2024
1 parent 1202bd4 commit fb034b0
Show file tree
Hide file tree
Showing 215 changed files with 3,742 additions and 3,055 deletions.
14 changes: 8 additions & 6 deletions crates/build/re_types_builder/src/codegen/cpp/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1729,12 +1729,12 @@ fn quote_fill_arrow_array_builder(
// C-style enum, encoded as a sparse arrow union
ObjectClass::Enum => {
quote! {
#parameter_check
ARROW_RETURN_NOT_OK(#builder->Reserve(static_cast<int64_t>(num_elements)));
for (size_t elem_idx = 0; elem_idx < num_elements; elem_idx += 1) {
const auto variant = elements[elem_idx];
ARROW_RETURN_NOT_OK(#builder->Append(static_cast<uint8_t>(variant)));
}
#parameter_check
ARROW_RETURN_NOT_OK(#builder->Reserve(static_cast<int64_t>(num_elements)));
for (size_t elem_idx = 0; elem_idx < num_elements; elem_idx += 1) {
const auto variant = elements[elem_idx];
ARROW_RETURN_NOT_OK(#builder->Append(static_cast<uint8_t>(variant)));
}
}
}

Expand Down Expand Up @@ -2482,6 +2482,7 @@ fn quote_arrow_field_type(
let name = &field.name;
let datatype = quote_arrow_datatype(&field.typ, objects, includes, false);
let is_nullable = field.is_nullable || field.typ == Type::Unit; // null type is always nullable
let is_nullable = is_nullable || field.typ.is_union(objects); // Rerun unions always has a `_null_marker: null` variant, so they are always nullable

quote! {
arrow::field(#name, #datatype, #is_nullable)
Expand All @@ -2496,6 +2497,7 @@ fn quote_arrow_elem_type(
let typ: Type = elem_type.clone().into();
let datatype = quote_arrow_datatype(&typ, objects, includes, false);
let is_nullable = typ == Type::Unit; // null type must be nullable
let is_nullable = is_nullable || elem_type.is_union(objects); // Rerun unions always has a `_null_marker: null` variant, so they are always nullable
quote! {
arrow::field("item", #datatype, #is_nullable)
}
Expand Down
3 changes: 2 additions & 1 deletion crates/build/re_types_builder/src/codegen/python/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2550,7 +2550,8 @@ fn quote_arrow_field(field: &Field) -> String {
} = field;

let datatype = quote_arrow_datatype(data_type);
let is_nullable = if *is_nullable { "True" } else { "False" };
let is_nullable = *is_nullable || matches!(data_type.to_logical_type(), DataType::Union { .. }); // Rerun unions always has a `_null_marker: null` variant, so they are always nullable
let is_nullable = if is_nullable { "True" } else { "False" };
let metadata = quote_metadata_map(metadata);

format!(r#"pa.field("{name}", {datatype}, nullable={is_nullable}, metadata={metadata})"#)
Expand Down
18 changes: 11 additions & 7 deletions crates/build/re_types_builder/src/codegen/rust/api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -958,13 +958,13 @@ fn quote_trait_impls_for_datatype_or_component(

let quoted_serializer = if let Some(forwarded_type) = forwarded_type.as_ref() {
quote! {
fn to_arrow2_opt<'a>(
fn to_arrow_opt<'a>(
data: impl IntoIterator<Item = Option<impl Into<::std::borrow::Cow<'a, Self>>>>,
) -> SerializationResult<Box<dyn arrow2::array::Array>>
) -> SerializationResult<arrow::array::ArrayRef>
where
Self: Clone + 'a,
{
#forwarded_type::to_arrow2_opt(data.into_iter().map(|datum| {
#forwarded_type::to_arrow_opt(data.into_iter().map(|datum| {
datum.map(|datum| match datum.into() {
::std::borrow::Cow::Borrowed(datum) => ::std::borrow::Cow::Borrowed(&datum.0),
::std::borrow::Cow::Owned(datum) => ::std::borrow::Cow::Owned(datum.0),
Expand All @@ -978,9 +978,9 @@ fn quote_trait_impls_for_datatype_or_component(

quote! {
// NOTE: Don't inline this, this gets _huge_.
fn to_arrow2_opt<'a>(
fn to_arrow_opt<'a>(
data: impl IntoIterator<Item = Option<impl Into<::std::borrow::Cow<'a, Self>>>>,
) -> SerializationResult<Box<dyn arrow2::array::Array>>
) -> SerializationResult<arrow::array::ArrayRef>
where
Self: Clone + 'a
{
Expand All @@ -989,10 +989,14 @@ fn quote_trait_impls_for_datatype_or_component(

#![allow(clippy::wildcard_imports)]
#![allow(clippy::manual_is_variant_and)]
use arrow::datatypes::*;
use arrow2::array::*;
use arrow::{array::*, buffer::*, datatypes::*};
use ::re_types_core::{Loggable as _, ResultExt as _};

#[allow(unused)]
fn as_array_ref<T: Array + 'static>(t: T) -> ArrayRef {
std::sync::Arc::new(t) as ArrayRef
}

Ok(#quoted_serializer)
}
}
Expand Down
53 changes: 46 additions & 7 deletions crates/build/re_types_builder/src/codegen/rust/arrow.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ use quote::quote;
// ---

/// `(Datatype, is_recursive)`
///
/// Renders an arrow2 `DataType` as Rust tokens via its `quote::ToTokens` impl.
///
/// If `is_recursive` is set to `true`,
/// then the generated code will often be shorter, as it will
/// defer to calling `arrow_datatype()` on the inner type
/// (this is what happens for `DataType::Extension`); otherwise the
/// extension's logical datatype is expanded inline.
pub struct ArrowDataTypeTokenizer<'a>(pub &'a ::arrow2::datatypes::DataType, pub bool);

impl quote::ToTokens for ArrowDataTypeTokenizer<'_> {
Expand All @@ -31,18 +35,18 @@ impl quote::ToTokens for ArrowDataTypeTokenizer<'_> {
DataType::LargeUtf8 => quote!(DataType::LargeUtf8),

DataType::List(field) => {
let field = ArrowFieldTokenizer(field);
let field = ArrowFieldTokenizer::new(field);
quote!(DataType::List(std::sync::Arc::new(#field)))
}

DataType::FixedSizeList(field, length) => {
let field = ArrowFieldTokenizer(field);
let field = ArrowFieldTokenizer::new(field);
let length = Literal::usize_unsuffixed(*length);
quote!(DataType::FixedSizeList(std::sync::Arc::new(#field), #length))
}

DataType::Union(fields, types, mode) => {
let fields = fields.iter().map(ArrowFieldTokenizer);
let fields = fields.iter().map(ArrowFieldTokenizer::new);
let mode = match mode {
UnionMode::Dense => quote!(UnionMode::Dense),
UnionMode::Sparse => quote!(UnionMode::Sparse),
Expand All @@ -66,18 +70,20 @@ impl quote::ToTokens for ArrowDataTypeTokenizer<'_> {
}

DataType::Struct(fields) => {
let fields = fields.iter().map(ArrowFieldTokenizer);
let fields = fields.iter().map(ArrowFieldTokenizer::new);
quote!(DataType::Struct(Fields::from(vec![ #(#fields,)* ])))
}

DataType::Extension(fqname, datatype, _metadata) => {
if *recursive {
// TODO(emilk): if the logical datatype is a primitive, then we can just use it directly
// so we get shorter generated code.
let fqname_use = quote_fqname_as_type_path(fqname);
quote!(<#fqname_use>::arrow_datatype())
} else {
let datatype = ArrowDataTypeTokenizer(datatype.to_logical_type(), false);
quote!(#datatype)
// TODO(cmc): Bring back extensions once we've fully replaced `arrow2-convert`!
// TODO(#3741): Bring back extensions once we've fully replaced `arrow2-convert`!
// let datatype = ArrowDataTypeTokenizer(datatype, false);
// let metadata = OptionTokenizer(metadata.as_ref());
// quote!(DataType::Extension(#fqname.to_owned(), Box::new(#datatype), #metadata))
Expand All @@ -90,16 +96,30 @@ impl quote::ToTokens for ArrowDataTypeTokenizer<'_> {
}
}

pub struct ArrowFieldTokenizer<'a>(pub &'a ::arrow2::datatypes::Field);
/// Wraps a borrowed arrow2 `Field` so that it can be rendered as Rust tokens
/// through its `quote::ToTokens` implementation.
///
/// Construct it with [`ArrowFieldTokenizer::new`]; the wrapped field is private.
pub struct ArrowFieldTokenizer<'a> {
// The arrow2 field description (name, datatype, nullability, metadata) to tokenize.
field: &'a ::arrow2::datatypes::Field,
}

impl<'a> ArrowFieldTokenizer<'a> {
pub fn new(field: &'a ::arrow2::datatypes::Field) -> Self {
Self { field }
}
}

impl quote::ToTokens for ArrowFieldTokenizer<'_> {
fn to_tokens(&self, tokens: &mut TokenStream) {
let Self { field } = self;
let arrow2::datatypes::Field {
name,
data_type,
is_nullable,
metadata,
} = &self.0;
} = field;

// Unions in Rerun always have a `_null_marker` arm, so all unions are nullable,
// whether they are specified as such or not.
let is_nullable =
*is_nullable || matches!(field.data_type.to_logical_type(), DataType::Union { .. });

let datatype = ArrowDataTypeTokenizer(data_type, true);

Expand Down Expand Up @@ -164,3 +184,22 @@ pub fn is_backed_by_arrow_buffer(typ: &DataType) -> bool {
| DataType::Float64
)
}

/// Returns the tokens naming the arrow type marker (e.g. `Int32Type`) that
/// corresponds to the given primitive `datatype`.
///
/// # Panics
///
/// Panics via `unimplemented!` if `datatype` is not one of the variants
/// listed below (null, boolean, the fixed-width integers, or the floats).
pub fn quoted_arrow_primitive_type(datatype: &DataType) -> TokenStream {
    match datatype {
        // Null and boolean.
        DataType::Null => quote! { NullType },
        DataType::Boolean => quote! { BooleanType },

        // Signed integers.
        DataType::Int8 => quote! { Int8Type },
        DataType::Int16 => quote! { Int16Type },
        DataType::Int32 => quote! { Int32Type },
        DataType::Int64 => quote! { Int64Type },

        // Unsigned integers.
        DataType::UInt8 => quote! { UInt8Type },
        DataType::UInt16 => quote! { UInt16Type },
        DataType::UInt32 => quote! { UInt32Type },
        DataType::UInt64 => quote! { UInt64Type },

        // Floating point.
        DataType::Float16 => quote! { Float16Type },
        DataType::Float32 => quote! { Float32Type },
        DataType::Float64 => quote! { Float64Type },

        // Anything else has no primitive type marker.
        other => unimplemented!("Not a primitive type: {other:#?}"),
    }
}
Loading

0 comments on commit fb034b0

Please sign in to comment.