Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Export schema from data objects #134

Merged
merged 1 commit into from
Aug 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions arro3-core/python/arro3/core/_core.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,17 @@ class Array:
For example, you can call [`pyarrow.array()`][pyarrow.array] to convert this
array into a pyarrow array, without copying memory.
"""
def __arrow_c_schema__(self) -> object:
    """
    An implementation of the [Arrow PyCapsule
    Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
    This dunder method should not be called directly, but enables zero-copy data
    transfer to other Python libraries that understand Arrow memory.

    This allows Arrow consumers to inspect the data type of this array. Then the
    consumer can ask the producer (in `__arrow_c_array__`) to cast the exported data
    to a supported data type.

    Returns:
        A PyCapsule wrapping an Arrow C `ArrowSchema` struct, per the PyCapsule
        Interface specification linked above.
    """
def __eq__(self, other) -> bool: ...
def __len__(self) -> int: ...
def __repr__(self) -> str: ...
Expand Down Expand Up @@ -111,6 +122,17 @@ class ArrayReader:
item yielded from the stream is an [`Array`][arro3.core.Array], not a
[`RecordBatch`][arro3.core.RecordBatch].
"""
def __arrow_c_schema__(self) -> object:
    """
    An implementation of the [Arrow PyCapsule
    Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
    This dunder method should not be called directly, but enables zero-copy data
    transfer to other Python libraries that understand Arrow memory.

    This allows Arrow consumers to inspect the data type of this ArrayReader
    without consuming the stream. Then the consumer can ask the producer (in
    `__arrow_c_stream__`) to cast the exported data to a supported data type.

    Returns:
        A PyCapsule wrapping an Arrow C `ArrowSchema` struct, per the PyCapsule
        Interface specification linked above.
    """
def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
"""
An implementation of the [Arrow PyCapsule
Expand Down Expand Up @@ -171,6 +193,17 @@ class ChunkedArray:
An implementation of the Array interface, for interoperability with numpy and
other array libraries.
"""
def __arrow_c_schema__(self) -> object:
    """
    An implementation of the [Arrow PyCapsule
    Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
    This dunder method should not be called directly, but enables zero-copy data
    transfer to other Python libraries that understand Arrow memory.

    This allows Arrow consumers to inspect the data type of this ChunkedArray. Then
    the consumer can ask the producer (in `__arrow_c_stream__`) to cast the exported
    data to a supported data type.

    Returns:
        A PyCapsule wrapping an Arrow C `ArrowSchema` struct, per the PyCapsule
        Interface specification linked above.
    """
def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
"""
An implementation of the [Arrow PyCapsule
Expand Down Expand Up @@ -823,6 +856,17 @@ class RecordBatch:
For example, you can call [`pyarrow.record_batch()`][pyarrow.record_batch] to
convert this RecordBatch into a pyarrow RecordBatch, without copying memory.
"""
def __arrow_c_schema__(self) -> object:
    """
    An implementation of the [Arrow PyCapsule
    Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
    This dunder method should not be called directly, but enables zero-copy data
    transfer to other Python libraries that understand Arrow memory.

    This allows Arrow consumers to inspect the schema of this RecordBatch. Then the
    consumer can ask the producer (in `__arrow_c_array__`) to cast the exported
    data to a supported schema.

    Returns:
        A PyCapsule wrapping an Arrow C `ArrowSchema` struct, per the PyCapsule
        Interface specification linked above.
    """
def __eq__(self, other) -> bool: ...
def __getitem__(self, key: int | str) -> Array: ...
def __repr__(self) -> str: ...
Expand Down Expand Up @@ -1029,6 +1073,17 @@ class RecordBatchReader:

A RecordBatchReader holds a stream of [`RecordBatch`][arro3.core.RecordBatch].
"""
def __arrow_c_schema__(self) -> object:
    """
    An implementation of the [Arrow PyCapsule
    Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
    This dunder method should not be called directly, but enables zero-copy data
    transfer to other Python libraries that understand Arrow memory.

    This allows Arrow consumers to inspect the schema of this RecordBatchReader
    without consuming the stream. Then the consumer can ask the producer (in
    `__arrow_c_stream__`) to cast the exported data to a supported schema.

    Returns:
        A PyCapsule wrapping an Arrow C `ArrowSchema` struct, per the PyCapsule
        Interface specification linked above.
    """
def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
"""
An implementation of the [Arrow PyCapsule
Expand Down Expand Up @@ -1304,6 +1359,17 @@ class Table:
schema: The expected schema of the Arrow Table. If not passed, will be inferred from the data. Mutually exclusive with 'names' argument. Defaults to None.
metadata: Optional metadata for the schema (if schema not passed). Defaults to None.
"""
def __arrow_c_schema__(self) -> object:
    """
    An implementation of the [Arrow PyCapsule
    Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
    This dunder method should not be called directly, but enables zero-copy data
    transfer to other Python libraries that understand Arrow memory.

    This allows Arrow consumers to inspect the schema of this Table. Then the
    consumer can ask the producer (in `__arrow_c_stream__`) to cast the exported
    data to a supported schema.

    Returns:
        A PyCapsule wrapping an Arrow C `ArrowSchema` struct, per the PyCapsule
        Interface specification linked above.
    """
def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
"""
An implementation of the [Arrow PyCapsule
Expand Down
6 changes: 5 additions & 1 deletion pyo3-arrow/src/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ use pyo3::types::{PyCapsule, PyTuple, PyType};

use crate::error::PyArrowResult;
use crate::ffi::from_python::utils::import_array_pycapsules;
use crate::ffi::to_array_pycapsules;
use crate::ffi::to_python::nanoarrow::to_nanoarrow_array;
use crate::ffi::{to_array_pycapsules, to_schema_pycapsule};
use crate::input::AnyArray;
use crate::interop::numpy::from_numpy::from_numpy;
use crate::interop::numpy::to_numpy::to_numpy;
Expand Down Expand Up @@ -226,6 +226,10 @@ impl PyArray {
to_array_pycapsules(py, self.field.clone(), &self.array, requested_schema)
}

/// Implements `__arrow_c_schema__` of the Arrow PyCapsule Interface.
///
/// Exports this array's field (its data type plus name/metadata) as a
/// PyCapsule wrapping an Arrow C `ArrowSchema`, so Arrow consumers can
/// inspect the type before requesting the data via `__arrow_c_array__`.
fn __arrow_c_schema__<'py>(&'py self, py: Python<'py>) -> PyArrowResult<Bound<'py, PyCapsule>> {
    to_schema_pycapsule(py, self.field.as_ref())
}

fn __eq__(&self, other: &PyArray) -> bool {
self.array.as_ref() == other.array.as_ref() && self.field == other.field
}
Expand Down
6 changes: 5 additions & 1 deletion pyo3-arrow/src/array_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use crate::ffi::from_python::ffi_stream::ArrowArrayStreamReader;
use crate::ffi::from_python::utils::import_stream_pycapsule;
use crate::ffi::to_python::nanoarrow::to_nanoarrow_array_stream;
use crate::ffi::to_python::to_stream_pycapsule;
use crate::ffi::{ArrayIterator, ArrayReader};
use crate::ffi::{to_schema_pycapsule, ArrayIterator, ArrayReader};
use crate::input::AnyArray;
use crate::{PyArray, PyChunkedArray, PyField};

Expand Down Expand Up @@ -103,6 +103,10 @@ impl Display for PyArrayReader {

#[pymethods]
impl PyArrayReader {
/// Implements `__arrow_c_schema__` of the Arrow PyCapsule Interface.
///
/// Exports the reader's field as a PyCapsule wrapping an Arrow C
/// `ArrowSchema` without consuming the stream. Propagates any error from
/// `field_ref()` (e.g. if the underlying reader is no longer available —
/// NOTE(review): confirm the exact failure mode of `field_ref`).
fn __arrow_c_schema__<'py>(&'py self, py: Python<'py>) -> PyArrowResult<Bound<'py, PyCapsule>> {
    to_schema_pycapsule(py, self.field_ref()?.as_ref())
}

#[allow(unused_variables)]
fn __arrow_c_stream__<'py>(
&'py mut self,
Expand Down
5 changes: 5 additions & 0 deletions pyo3-arrow/src/chunked.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ use crate::ffi::from_python::utils::import_stream_pycapsule;
use crate::ffi::to_python::chunked::ArrayIterator;
use crate::ffi::to_python::nanoarrow::to_nanoarrow_array_stream;
use crate::ffi::to_python::to_stream_pycapsule;
use crate::ffi::to_schema_pycapsule;
use crate::input::AnyArray;
use crate::interop::numpy::to_numpy::chunked_to_numpy;
use crate::{PyArray, PyDataType, PyField};
Expand Down Expand Up @@ -261,6 +262,10 @@ impl PyChunkedArray {
chunked_to_numpy(py, chunk_refs.as_slice())
}

/// Implements `__arrow_c_schema__` of the Arrow PyCapsule Interface.
///
/// Exports this chunked array's field as a PyCapsule wrapping an Arrow C
/// `ArrowSchema`, letting consumers inspect the type before requesting the
/// chunk stream via `__arrow_c_stream__`.
fn __arrow_c_schema__<'py>(&'py self, py: Python<'py>) -> PyArrowResult<Bound<'py, PyCapsule>> {
    to_schema_pycapsule(py, self.field.as_ref())
}

#[allow(unused_variables)]
fn __arrow_c_stream__<'py>(
&'py self,
Expand Down
9 changes: 7 additions & 2 deletions pyo3-arrow/src/ffi/to_python/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ pub fn to_array_pycapsules<'py>(
// Note: we don't import a Field directly because the name might not be set.
// https://github.com/apache/arrow-rs/issues/6251
let data_type = DataType::try_from(schema_ptr)?;
let field = Arc::new(Field::new("", data_type, true));
let field =
Arc::new(Field::new("", data_type, true).with_metadata(field.metadata().clone()));

let casted_array = cast(array, field.data_type())?;
(casted_array.to_data(), field)
Expand Down Expand Up @@ -72,10 +73,14 @@ pub fn to_stream_pycapsule<'py>(
if let Some(capsule) = requested_schema {
let schema_ptr = import_schema_pycapsule(&capsule)?;

let existing_field = array_reader.field();

// Note: we don't import a Field directly because the name might not be set.
// https://github.com/apache/arrow-rs/issues/6251
let data_type = DataType::try_from(schema_ptr)?;
let field = Arc::new(Field::new("", data_type, true));
let field = Arc::new(
Field::new("", data_type, true).with_metadata(existing_field.metadata().clone()),
);

let output_field = field.clone();
let array_iter = array_reader.map(move |array| {
Expand Down
5 changes: 5 additions & 0 deletions pyo3-arrow/src/record_batch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ use crate::error::PyArrowResult;
use crate::ffi::from_python::utils::import_array_pycapsules;
use crate::ffi::to_python::nanoarrow::to_nanoarrow_array;
use crate::ffi::to_python::to_array_pycapsules;
use crate::ffi::to_schema_pycapsule;
use crate::input::{AnyRecordBatch, FieldIndexInput, MetadataInput, NameOrField, SelectIndices};
use crate::schema::display_schema;
use crate::{PyArray, PyField, PySchema};
Expand Down Expand Up @@ -133,6 +134,10 @@ impl PyRecordBatch {
to_array_pycapsules(py, field.into(), &array, requested_schema)
}

/// Implements `__arrow_c_schema__` of the Arrow PyCapsule Interface.
///
/// Exports this record batch's schema as a PyCapsule wrapping an Arrow C
/// `ArrowSchema`, letting consumers inspect the schema before requesting the
/// data via `__arrow_c_array__`.
fn __arrow_c_schema__<'py>(&'py self, py: Python<'py>) -> PyArrowResult<Bound<'py, PyCapsule>> {
    to_schema_pycapsule(py, self.0.schema_ref().as_ref())
}

fn __eq__(&self, other: &PyRecordBatch) -> bool {
self.0 == other.0
}
Expand Down
5 changes: 5 additions & 0 deletions pyo3-arrow/src/record_batch_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use crate::ffi::from_python::utils::import_stream_pycapsule;
use crate::ffi::to_python::chunked::ArrayIterator;
use crate::ffi::to_python::nanoarrow::to_nanoarrow_array_stream;
use crate::ffi::to_python::to_stream_pycapsule;
use crate::ffi::to_schema_pycapsule;
use crate::input::AnyRecordBatch;
use crate::schema::display_schema;
use crate::{PyRecordBatch, PySchema, PyTable};
Expand Down Expand Up @@ -116,6 +117,10 @@ impl Display for PyRecordBatchReader {

#[pymethods]
impl PyRecordBatchReader {
/// Implements `__arrow_c_schema__` of the Arrow PyCapsule Interface.
///
/// Exports the reader's schema as a PyCapsule wrapping an Arrow C
/// `ArrowSchema` without consuming the stream. Propagates any error from
/// `schema_ref()` (e.g. if the underlying reader is no longer available —
/// NOTE(review): confirm the exact failure mode of `schema_ref`).
fn __arrow_c_schema__<'py>(&'py self, py: Python<'py>) -> PyArrowResult<Bound<'py, PyCapsule>> {
    to_schema_pycapsule(py, self.schema_ref()?.as_ref())
}

#[allow(unused_variables)]
fn __arrow_c_stream__<'py>(
&'py mut self,
Expand Down
5 changes: 5 additions & 0 deletions pyo3-arrow/src/table.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ use crate::ffi::from_python::utils::import_stream_pycapsule;
use crate::ffi::to_python::chunked::ArrayIterator;
use crate::ffi::to_python::nanoarrow::to_nanoarrow_array_stream;
use crate::ffi::to_python::to_stream_pycapsule;
use crate::ffi::to_schema_pycapsule;
use crate::input::{
AnyArray, AnyRecordBatch, FieldIndexInput, MetadataInput, NameOrField, SelectIndices,
};
Expand Down Expand Up @@ -191,6 +192,10 @@ impl PyTable {
}
}

/// Implements `__arrow_c_schema__` of the Arrow PyCapsule Interface.
///
/// Exports this table's schema as a PyCapsule wrapping an Arrow C
/// `ArrowSchema`, letting consumers inspect the schema before requesting the
/// batch stream via `__arrow_c_stream__`.
fn __arrow_c_schema__<'py>(&'py self, py: Python<'py>) -> PyArrowResult<Bound<'py, PyCapsule>> {
    to_schema_pycapsule(py, self.schema.as_ref())
}

#[allow(unused_variables)]
fn __arrow_c_stream__<'py>(
&'py self,
Expand Down