From 2d1389858129b121cf5b1ffd8b04024ad6c79e1f Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 24 Oct 2023 16:26:53 -0400 Subject: [PATCH] GH-36099: [C++] Add Utf8View and BinaryView to the c data bridge --- cpp/src/arrow/c/bridge.cc | 88 ++++++++++++++++++---- cpp/src/arrow/c/bridge_test.cc | 26 ++++++- dev/archery/archery/cli.py | 2 +- dev/archery/archery/integration/datagen.py | 4 +- docs/source/format/CDataInterface.rst | 12 +++ 5 files changed, 112 insertions(+), 20 deletions(-) diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index 033371d3d6719..fe4519551de18 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +42,7 @@ #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" +#include "arrow/util/range.h" #include "arrow/util/small_vector.h" #include "arrow/util/string.h" #include "arrow/util/value_parsing.h" @@ -51,6 +53,8 @@ namespace arrow { using internal::checked_cast; using internal::checked_pointer_cast; +using internal::Zip; + using internal::SmallVector; using internal::StaticVector; @@ -260,7 +264,7 @@ struct SchemaExporter { // Dictionary type: parent struct describes index type, // child dictionary struct describes value type. RETURN_NOT_OK(VisitTypeInline(*dict_type.index_type(), this)); - dict_exporter_.reset(new SchemaExporter()); + dict_exporter_ = std::make_unique(); RETURN_NOT_OK(dict_exporter_->ExportType(*dict_type.value_type())); } else { RETURN_NOT_OK(VisitTypeInline(type, this)); @@ -357,10 +361,14 @@ struct SchemaExporter { Status Visit(const LargeBinaryType& type) { return SetFormat("Z"); } + Status Visit(const BinaryViewType& type) { return SetFormat("vz"); } + Status Visit(const StringType& type) { return SetFormat("u"); } Status Visit(const LargeStringType& type) { return SetFormat("U"); } + Status Visit(const StringViewType& type) { return SetFormat("vu"); } + Status Visit(const Date32Type& type) { return SetFormat("tdD"); } Status Visit(const Date64Type& type) { return SetFormat("tdm"); } @@ -517,13 +525,14 @@ namespace { struct ExportedArrayPrivateData : PoolAllocationMixin { // The buffers are owned by the ArrayData member - StaticVector buffers_; + SmallVector buffers_; struct ArrowArray dictionary_; SmallVector children_; SmallVector child_pointers_; std::shared_ptr data_; std::shared_ptr sync_; + std::vector variadic_buffer_sizes_; ExportedArrayPrivateData() = default; ARROW_DEFAULT_MOVE_AND_ASSIGN(ExportedArrayPrivateData); @@ -566,15 +575,31 @@ struct ArrayExporter { --n_buffers; ++buffers_begin; } + + bool need_variadic_buffer_sizes = + data->type->id() == Type::BINARY_VIEW || data->type->id() == Type::STRING_VIEW; + if (need_variadic_buffer_sizes) { + ++n_buffers; + } + export_.buffers_.resize(n_buffers); std::transform(buffers_begin, data->buffers.end(), export_.buffers_.begin(), [](const std::shared_ptr& buffer) -> const void* { return buffer ? buffer->data() : nullptr; }); + if (need_variadic_buffer_sizes) { + auto variadic_buffers = util::span(data->buffers).subspan(2); + export_.variadic_buffer_sizes_.resize(variadic_buffers.size()); + for (auto [size, buf] : Zip(export_.variadic_buffer_sizes_, variadic_buffers)) { + size = buf->size(); + } + export_.buffers_.back() = export_.variadic_buffer_sizes_.data(); + } + // Export dictionary if (data->dictionary != nullptr) { - dict_exporter_.reset(new ArrayExporter()); + dict_exporter_ = std::make_unique(); RETURN_NOT_OK(dict_exporter_->Export(data->dictionary)); } @@ -791,7 +816,7 @@ Status InvalidFormatString(std::string_view v) { class FormatStringParser { public: - FormatStringParser() {} + FormatStringParser() = default; explicit FormatStringParser(std::string_view v) : view_(v), index_(0) {} @@ -937,8 +962,6 @@ Result DecodeMetadata(const char* metadata) { } struct SchemaImporter { - SchemaImporter() : c_struct_(nullptr), guard_(nullptr) {} - Status Import(struct ArrowSchema* src) { if (ArrowSchemaIsReleased(src)) { return Status::Invalid("Cannot import released ArrowSchema"); @@ -1064,6 +1087,8 @@ struct SchemaImporter { return ProcessPrimitive(binary()); case 'Z': return ProcessPrimitive(large_binary()); + case 'v': + return ProcessBinaryView(); case 'w': return ProcessFixedSizeBinary(); case 'd': @@ -1076,6 +1101,17 @@ struct SchemaImporter { return f_parser_.Invalid(); } + Status ProcessBinaryView() { + RETURN_NOT_OK(f_parser_.CheckHasNext()); + switch (f_parser_.Next()) { + case 'z': + return ProcessPrimitive(binary_view()); + case 'u': + return ProcessPrimitive(utf8_view()); + } + return f_parser_.Invalid(); + } + Status ProcessTemporal() { RETURN_NOT_OK(f_parser_.CheckHasNext()); switch (f_parser_.Next()) { @@ -1337,8 +1373,8 @@ struct SchemaImporter { return Status::OK(); } - struct ArrowSchema* c_struct_; - SchemaExportGuard guard_; + struct ArrowSchema* c_struct_{nullptr}; + SchemaExportGuard guard_{nullptr}; FormatStringParser f_parser_; int64_t recursion_level_; std::vector child_importers_; @@ -1406,7 +1442,7 @@ class ImportedBuffer : public Buffer { std::shared_ptr import) : Buffer(data, size, mm, nullptr, device_type), import_(std::move(import)) {} - ~ImportedBuffer() override {} + ~ImportedBuffer() override = default; std::shared_ptr device_sync_event() override { return import_->device_sync_; @@ -1418,9 +1454,7 @@ class ImportedBuffer : public Buffer { struct ArrayImporter { explicit ArrayImporter(const std::shared_ptr& type) - : type_(type), - zero_size_buffer_(std::make_shared(kZeroSizeArea, 0)), - device_type_(DeviceAllocationType::kCPU) {} + : type_(type), zero_size_buffer_(std::make_shared(kZeroSizeArea, 0)) {} Status Import(struct ArrowDeviceArray* src, const DeviceMemoryMapper& mapper) { ARROW_ASSIGN_OR_RAISE(memory_mgr_, mapper(src->device_type, src->device_id)); @@ -1568,6 +1602,10 @@ struct ArrayImporter { Status Visit(const LargeBinaryType& type) { return ImportStringLike(type); } + Status Visit(const StringViewType& type) { return ImportBinaryView(type); } + + Status Visit(const BinaryViewType& type) { return ImportBinaryView(type); } + Status Visit(const ListType& type) { return ImportListLike(type); } Status Visit(const LargeListType& type) { return ImportListLike(type); } @@ -1646,6 +1684,28 @@ struct ArrayImporter { return Status::OK(); } + Status ImportBinaryView(const BinaryViewType&) { + RETURN_NOT_OK(CheckNoChildren()); + if (c_struct_->n_buffers < 3) { + return Status::Invalid("Expected at least 3 buffers for imported type ", + type_->ToString(), ", ArrowArray struct has ", + c_struct_->n_buffers); + } + RETURN_NOT_OK(AllocateArrayData()); + RETURN_NOT_OK(ImportNullBitmap()); + RETURN_NOT_OK(ImportFixedSizeBuffer(1, BinaryViewType::kSize)); + + // The last C data buffer stores buffer sizes, and shouldn't be imported + auto* buffer_sizes = + static_cast(c_struct_->buffers[c_struct_->n_buffers - 1]); + + for (int32_t buffer_id = 2; buffer_id < c_struct_->n_buffers - 1; ++buffer_id) { + RETURN_NOT_OK(ImportBuffer(buffer_id, buffer_sizes[buffer_id - 2])); + } + data_->buffers.pop_back(); + return Status::OK(); + } + template Status ImportStringLike(const StringType& type) { RETURN_NOT_OK(CheckNoChildren()); @@ -1790,7 +1850,7 @@ struct ArrayImporter { std::shared_ptr zero_size_buffer_; std::shared_ptr memory_mgr_; - DeviceAllocationType device_type_; + DeviceAllocationType device_type_{}; }; } // namespace @@ -1996,7 +2056,7 @@ class ArrayStreamBatchReader : public RecordBatchReader { DCHECK(!ArrowArrayStreamIsReleased(&stream_)); } - ~ArrayStreamBatchReader() { + ~ArrayStreamBatchReader() override { if (!ArrowArrayStreamIsReleased(&stream_)) { ArrowArrayStreamRelease(&stream_); } diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index bd0e498a9f332..157a1f80a1660 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -42,6 +42,7 @@ #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" +#include "arrow/util/range.h" // TODO(GH-37221): Remove these ifdef checks when compute dependency is removed #ifdef ARROW_COMPUTE @@ -57,6 +58,7 @@ using internal::ArrayStreamExportTraits; using internal::checked_cast; using internal::SchemaExportGuard; using internal::SchemaExportTraits; +using internal::Zip; template struct ExportTraits {}; @@ -353,6 +355,8 @@ TEST_F(TestSchemaExport, Primitive) { TestPrimitive(large_binary(), "Z"); TestPrimitive(utf8(), "u"); TestPrimitive(large_utf8(), "U"); + TestPrimitive(binary_view(), "vz"); + TestPrimitive(utf8_view(), "vu"); TestPrimitive(decimal(16, 4), "d:16,4"); TestPrimitive(decimal256(16, 4), "d:16,4,256"); @@ -556,12 +560,24 @@ struct ArrayExportChecker { --expected_n_buffers; ++expected_buffers; } - ASSERT_EQ(c_export->n_buffers, expected_n_buffers); + bool has_variadic_buffer_sizes = expected_data.type->id() == Type::STRING_VIEW || + expected_data.type->id() == Type::BINARY_VIEW; + ASSERT_EQ(c_export->n_buffers, expected_n_buffers + has_variadic_buffer_sizes); ASSERT_NE(c_export->buffers, nullptr); - for (int64_t i = 0; i < c_export->n_buffers; ++i) { + + for (int64_t i = 0; i < expected_n_buffers; ++i) { auto expected_ptr = expected_buffers[i] ? expected_buffers[i]->data() : nullptr; ASSERT_EQ(c_export->buffers[i], expected_ptr); } + if (has_variadic_buffer_sizes) { + auto variadic_buffers = util::span(expected_data.buffers).subspan(2); + auto variadic_buffer_sizes = util::span( + static_cast(c_export->buffers[c_export->n_buffers - 1]), + variadic_buffers.size()); + for (auto [buf, size] : Zip(variadic_buffers, variadic_buffer_sizes)) { + ASSERT_EQ(buf->size(), size); + } + } if (expected_data.dictionary != nullptr) { // Recurse into dictionary @@ -874,6 +890,8 @@ TEST_F(TestArrayExport, Primitive) { TestPrimitive(large_binary(), R"(["foo", "bar", null])"); TestPrimitive(utf8(), R"(["foo", "bar", null])"); TestPrimitive(large_utf8(), R"(["foo", "bar", null])"); + TestPrimitive(binary_view(), R"(["foo", "bar", null])"); + TestPrimitive(utf8_view(), R"(["foo", "bar", null])"); TestPrimitive(decimal(16, 4), R"(["1234.5670", null])"); TestPrimitive(decimal256(16, 4), R"(["1234.5670", null])"); @@ -1891,6 +1909,10 @@ TEST_F(TestSchemaImport, String) { CheckImport(large_utf8()); FillPrimitive("Z"); CheckImport(large_binary()); + FillPrimitive("vu"); + CheckImport(utf8_view()); + FillPrimitive("vz"); + CheckImport(binary_view()); FillPrimitive("w:3"); CheckImport(fixed_size_binary(3)); diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 7a3b45f9788e6..7211b6c87a055 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -42,7 +42,7 @@ BOOL = ArrowBool() -@click.group() +@click.group(context_settings={"help_option_names": ["-h", "--help"]}) @click.option("--debug", type=BOOL, is_flag=True, default=False, help="Increase logging with debugging output.") @click.option("--pdb", type=BOOL, is_flag=True, default=False, diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 1ce2775c160b8..dbb0256be90e0 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -1860,9 +1860,7 @@ def _temp_path(): .skip_tester('Go') .skip_tester('Java') .skip_tester('JS') - .skip_tester('Rust') - .skip_format(SKIP_C_SCHEMA, 'C++') - .skip_format(SKIP_C_ARRAY, 'C++'), + .skip_tester('Rust'), generate_extension_case() .skip_tester('C#') diff --git a/docs/source/format/CDataInterface.rst b/docs/source/format/CDataInterface.rst index e0884686acf6c..a1971df8ea02f 100644 --- a/docs/source/format/CDataInterface.rst +++ b/docs/source/format/CDataInterface.rst @@ -140,10 +140,14 @@ strings: +-----------------+---------------------------------------------------+------------+ | ``Z`` | large binary | | +-----------------+---------------------------------------------------+------------+ +| ``vz`` | binary view | | ++-----------------+---------------------------------------------------+------------+ | ``u`` | utf-8 string | | +-----------------+---------------------------------------------------+------------+ | ``U`` | large utf-8 string | | +-----------------+---------------------------------------------------+------------+ +| ``vu`` | utf-8 view | | ++-----------------+---------------------------------------------------+------------+ | ``d:19,10`` | decimal128 [precision 19, scale 10] | | +-----------------+---------------------------------------------------+------------+ | ``d:19,10,NNN`` | decimal bitwidth = NNN [precision 19, scale 10] | | @@ -542,6 +546,14 @@ parameterized extension types). The ``ArrowArray`` structure exported from an extension array simply points to the storage data of the extension array. +Binary view arrays +------------------ + +For binary or utf-8 view arrays, an extra buffer is appended which stores +the lengths of each variadic data buffer as ``int64_t``. This buffer is +necessary since these buffer lengths are not trivially extractable from +other data in an array of binary or utf-8 view type. + .. _c-data-interface-semantics: Semantics