From 6dfb5039913a0cb8db2b0e06aaca484500304e0c Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Wed, 14 Aug 2024 14:21:07 -0400 Subject: [PATCH] Export schema from data objects --- arro3-core/python/arro3/core/_core.pyi | 66 ++++++++++++++++++++++++++ pyo3-arrow/src/array.rs | 6 ++- pyo3-arrow/src/array_reader.rs | 6 ++- pyo3-arrow/src/chunked.rs | 5 ++ pyo3-arrow/src/ffi/to_python/utils.rs | 9 +++- pyo3-arrow/src/record_batch.rs | 5 ++ pyo3-arrow/src/record_batch_reader.rs | 5 ++ pyo3-arrow/src/table.rs | 5 ++ 8 files changed, 103 insertions(+), 4 deletions(-) diff --git a/arro3-core/python/arro3/core/_core.pyi b/arro3-core/python/arro3/core/_core.pyi index 63752bf..6882e01 100644 --- a/arro3-core/python/arro3/core/_core.pyi +++ b/arro3-core/python/arro3/core/_core.pyi @@ -43,6 +43,17 @@ class Array: For example, you can call [`pyarrow.array()`][pyarrow.array] to convert this array into a pyarrow array, without copying memory. """ + def __arrow_c_schema__(self) -> object: + """ + An implementation of the [Arrow PyCapsule + Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). + This dunder method should not be called directly, but enables zero-copy data + transfer to other Python libraries that understand Arrow memory. + + This allows Arrow consumers to inspect the data type of this array. Then the + consumer can ask the producer (in `__arrow_c_array__`) to cast the exported data + to a supported data type. + """ def __eq__(self, other) -> bool: ... def __len__(self) -> int: ... def __repr__(self) -> str: ... @@ -111,6 +122,17 @@ class ArrayReader: item yielded from the stream is an [`Array`][arro3.core.Array], not a [`RecordBatch`][arro3.core.RecordBatch]. """ + def __arrow_c_schema__(self) -> object: + """ + An implementation of the [Arrow PyCapsule + Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). 
+ This dunder method should not be called directly, but enables zero-copy data + transfer to other Python libraries that understand Arrow memory. + + This allows Arrow consumers to inspect the data type of this ArrayReader. Then + the consumer can ask the producer (in `__arrow_c_stream__`) to cast the exported + data to a supported data type. + """ def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: """ An implementation of the [Arrow PyCapsule @@ -171,6 +193,17 @@ class ChunkedArray: An implementation of the Array interface, for interoperability with numpy and other array libraries. """ + def __arrow_c_schema__(self) -> object: + """ + An implementation of the [Arrow PyCapsule + Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). + This dunder method should not be called directly, but enables zero-copy data + transfer to other Python libraries that understand Arrow memory. + + This allows Arrow consumers to inspect the data type of this ChunkedArray. Then + the consumer can ask the producer (in `__arrow_c_stream__`) to cast the exported + data to a supported data type. + """ def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: """ An implementation of the [Arrow PyCapsule @@ -823,6 +856,17 @@ class RecordBatch: For example, you can call [`pyarrow.record_batch()`][pyarrow.record_batch] to convert this RecordBatch into a pyarrow RecordBatch, without copying memory. """ + def __arrow_c_schema__(self) -> object: + """ + An implementation of the [Arrow PyCapsule + Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). + This dunder method should not be called directly, but enables zero-copy data + transfer to other Python libraries that understand Arrow memory. + + This allows Arrow consumers to inspect the data type of this RecordBatch. 
Then + the consumer can ask the producer (in `__arrow_c_array__`) to cast the exported + data to a supported data type. + """ def __eq__(self, other) -> bool: ... def __getitem__(self, key: int | str) -> Array: ... def __repr__(self) -> str: ... @@ -1029,6 +1073,17 @@ class RecordBatchReader: A RecordBatchReader holds a stream of [`RecordBatch`][arro3.core.RecordBatch]. """ + def __arrow_c_schema__(self) -> object: + """ + An implementation of the [Arrow PyCapsule + Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). + This dunder method should not be called directly, but enables zero-copy data + transfer to other Python libraries that understand Arrow memory. + + This allows Arrow consumers to inspect the data type of this RecordBatchReader. + Then the consumer can ask the producer (in `__arrow_c_stream__`) to cast the + exported data to a supported data type. + """ def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: """ An implementation of the [Arrow PyCapsule @@ -1304,6 +1359,17 @@ class Table: schema: The expected schema of the Arrow Table. If not passed, will be inferred from the data. Mutually exclusive with 'names' argument. Defaults to None. metadata: Optional metadata for the schema (if schema not passed). Defaults to None. """ + def __arrow_c_schema__(self) -> object: + """ + An implementation of the [Arrow PyCapsule + Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). + This dunder method should not be called directly, but enables zero-copy data + transfer to other Python libraries that understand Arrow memory. + + This allows Arrow consumers to inspect the data type of this Table. Then the + consumer can ask the producer (in `__arrow_c_stream__`) to cast the exported + data to a supported data type. 
+ """ def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: """ An implementation of the [Arrow PyCapsule diff --git a/pyo3-arrow/src/array.rs b/pyo3-arrow/src/array.rs index 17c4f17..2899bc6 100644 --- a/pyo3-arrow/src/array.rs +++ b/pyo3-arrow/src/array.rs @@ -19,8 +19,8 @@ use pyo3::types::{PyCapsule, PyTuple, PyType}; use crate::error::PyArrowResult; use crate::ffi::from_python::utils::import_array_pycapsules; -use crate::ffi::to_array_pycapsules; use crate::ffi::to_python::nanoarrow::to_nanoarrow_array; +use crate::ffi::{to_array_pycapsules, to_schema_pycapsule}; use crate::input::AnyArray; use crate::interop::numpy::from_numpy::from_numpy; use crate::interop::numpy::to_numpy::to_numpy; @@ -226,6 +226,10 @@ impl PyArray { to_array_pycapsules(py, self.field.clone(), &self.array, requested_schema) } + fn __arrow_c_schema__<'py>(&'py self, py: Python<'py>) -> PyArrowResult<Bound<'py, PyCapsule>> { + to_schema_pycapsule(py, self.field.as_ref()) + } + fn __eq__(&self, other: &PyArray) -> bool { self.array.as_ref() == other.array.as_ref() && self.field == other.field } diff --git a/pyo3-arrow/src/array_reader.rs b/pyo3-arrow/src/array_reader.rs index ea7b8b1..f389f4a 100644 --- a/pyo3-arrow/src/array_reader.rs +++ b/pyo3-arrow/src/array_reader.rs @@ -11,7 +11,7 @@ use crate::ffi::from_python::ffi_stream::ArrowArrayStreamReader; use crate::ffi::from_python::utils::import_stream_pycapsule; use crate::ffi::to_python::nanoarrow::to_nanoarrow_array_stream; use crate::ffi::to_python::to_stream_pycapsule; -use crate::ffi::{ArrayIterator, ArrayReader}; +use crate::ffi::{to_schema_pycapsule, ArrayIterator, ArrayReader}; use crate::input::AnyArray; use crate::{PyArray, PyChunkedArray, PyField}; @@ -103,6 +103,10 @@ impl Display for PyArrayReader { #[pymethods] impl PyArrayReader { + fn __arrow_c_schema__<'py>(&'py self, py: Python<'py>) -> PyArrowResult<Bound<'py, PyCapsule>> { + to_schema_pycapsule(py, self.field_ref()?.as_ref()) + } + #[allow(unused_variables)] fn __arrow_c_stream__<'py>(
&'py mut self, diff --git a/pyo3-arrow/src/chunked.rs b/pyo3-arrow/src/chunked.rs index d26902a..016a9cc 100644 --- a/pyo3-arrow/src/chunked.rs +++ b/pyo3-arrow/src/chunked.rs @@ -15,6 +15,7 @@ use crate::ffi::from_python::utils::import_stream_pycapsule; use crate::ffi::to_python::chunked::ArrayIterator; use crate::ffi::to_python::nanoarrow::to_nanoarrow_array_stream; use crate::ffi::to_python::to_stream_pycapsule; +use crate::ffi::to_schema_pycapsule; use crate::input::AnyArray; use crate::interop::numpy::to_numpy::chunked_to_numpy; use crate::{PyArray, PyDataType, PyField}; @@ -261,6 +262,10 @@ impl PyChunkedArray { chunked_to_numpy(py, chunk_refs.as_slice()) } + fn __arrow_c_schema__<'py>(&'py self, py: Python<'py>) -> PyArrowResult<Bound<'py, PyCapsule>> { + to_schema_pycapsule(py, self.field.as_ref()) + } + #[allow(unused_variables)] fn __arrow_c_stream__<'py>( &'py self, diff --git a/pyo3-arrow/src/ffi/to_python/utils.rs b/pyo3-arrow/src/ffi/to_python/utils.rs index 0af3d09..9f46e60 100644 --- a/pyo3-arrow/src/ffi/to_python/utils.rs +++ b/pyo3-arrow/src/ffi/to_python/utils.rs @@ -40,7 +40,8 @@ pub fn to_array_pycapsules<'py>( // Note: we don't import a Field directly because the name might not be set. // https://github.com/apache/arrow-rs/issues/6251 let data_type = DataType::try_from(schema_ptr)?; - let field = Arc::new(Field::new("", data_type, true)); + let field = + Arc::new(Field::new("", data_type, true).with_metadata(field.metadata().clone())); let casted_array = cast(array, field.data_type())?; (casted_array.to_data(), field) @@ -72,10 +73,14 @@ pub fn to_stream_pycapsule<'py>( if let Some(capsule) = requested_schema { let schema_ptr = import_schema_pycapsule(&capsule)?; + let existing_field = array_reader.field(); + // Note: we don't import a Field directly because the name might not be set. 
// https://github.com/apache/arrow-rs/issues/6251 let data_type = DataType::try_from(schema_ptr)?; - let field = Arc::new(Field::new("", data_type, true)); + let field = Arc::new( + Field::new("", data_type, true).with_metadata(existing_field.metadata().clone()), + ); let output_field = field.clone(); let array_iter = array_reader.map(move |array| { diff --git a/pyo3-arrow/src/record_batch.rs b/pyo3-arrow/src/record_batch.rs index 585303b..124c2ee 100644 --- a/pyo3-arrow/src/record_batch.rs +++ b/pyo3-arrow/src/record_batch.rs @@ -15,6 +15,7 @@ use crate::error::PyArrowResult; use crate::ffi::from_python::utils::import_array_pycapsules; use crate::ffi::to_python::nanoarrow::to_nanoarrow_array; use crate::ffi::to_python::to_array_pycapsules; +use crate::ffi::to_schema_pycapsule; use crate::input::{AnyRecordBatch, FieldIndexInput, MetadataInput, NameOrField, SelectIndices}; use crate::schema::display_schema; use crate::{PyArray, PyField, PySchema}; @@ -133,6 +134,10 @@ impl PyRecordBatch { to_array_pycapsules(py, field.into(), &array, requested_schema) } + fn __arrow_c_schema__<'py>(&'py self, py: Python<'py>) -> PyArrowResult<Bound<'py, PyCapsule>> { + to_schema_pycapsule(py, self.0.schema_ref().as_ref()) + } + fn __eq__(&self, other: &PyRecordBatch) -> bool { self.0 == other.0 } diff --git a/pyo3-arrow/src/record_batch_reader.rs b/pyo3-arrow/src/record_batch_reader.rs index 6537be0..d21eb04 100644 --- a/pyo3-arrow/src/record_batch_reader.rs +++ b/pyo3-arrow/src/record_batch_reader.rs @@ -13,6 +13,7 @@ use crate::ffi::from_python::utils::import_stream_pycapsule; use crate::ffi::to_python::chunked::ArrayIterator; use crate::ffi::to_python::nanoarrow::to_nanoarrow_array_stream; use crate::ffi::to_python::to_stream_pycapsule; +use crate::ffi::to_schema_pycapsule; use crate::input::AnyRecordBatch; use crate::schema::display_schema; use crate::{PyRecordBatch, PySchema, PyTable}; @@ -116,6 +117,10 @@ impl Display for PyRecordBatchReader { #[pymethods] impl PyRecordBatchReader { + fn 
__arrow_c_schema__<'py>(&'py self, py: Python<'py>) -> PyArrowResult<Bound<'py, PyCapsule>> { + to_schema_pycapsule(py, self.schema_ref()?.as_ref()) + } + #[allow(unused_variables)] fn __arrow_c_stream__<'py>( &'py mut self, diff --git a/pyo3-arrow/src/table.rs b/pyo3-arrow/src/table.rs index b47d170..0c4afa7 100644 --- a/pyo3-arrow/src/table.rs +++ b/pyo3-arrow/src/table.rs @@ -17,6 +17,7 @@ use crate::ffi::from_python::utils::import_stream_pycapsule; use crate::ffi::to_python::chunked::ArrayIterator; use crate::ffi::to_python::nanoarrow::to_nanoarrow_array_stream; use crate::ffi::to_python::to_stream_pycapsule; +use crate::ffi::to_schema_pycapsule; use crate::input::{ AnyArray, AnyRecordBatch, FieldIndexInput, MetadataInput, NameOrField, SelectIndices, }; @@ -191,6 +192,10 @@ impl PyTable { } } + fn __arrow_c_schema__<'py>(&'py self, py: Python<'py>) -> PyArrowResult<Bound<'py, PyCapsule>> { + to_schema_pycapsule(py, self.schema.as_ref()) + } + #[allow(unused_variables)] fn __arrow_c_stream__<'py>( &'py self,