diff --git a/CMakeLists.txt b/CMakeLists.txt index f4f9267..bac8c32 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,7 +99,7 @@ ExternalProject_Add( GIT_REPOSITORY "https://github.com/delta-incubator/delta-kernel-rs" # WARNING: the FFI headers are currently pinned due to the C linkage issue of the c++ headers. Currently, when bumping # the kernel version, the produced header in ./src/include/delta_kernel_ffi.hpp should be also bumped, applying the fix - GIT_TAG v0.3.1 + GIT_TAG v0.4.0 # Prints the env variables passed to the cargo build to the terminal, useful in debugging because passing them # through CMake is an error-prone mess CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${RUST_UNSET_ENV_VARS} ${RUST_ENV_VARS} env diff --git a/src/include/delta_kernel_ffi.hpp b/src/include/delta_kernel_ffi.hpp index 58be58c..3c46bde 100644 --- a/src/include/delta_kernel_ffi.hpp +++ b/src/include/delta_kernel_ffi.hpp @@ -51,6 +51,7 @@ enum class KernelError { InvalidStructDataError, InternalError, InvalidExpression, + InvalidLogPath, }; struct CStringMap; @@ -107,10 +108,10 @@ struct KernelRowIndexArray { /// An accompanying [`HandleDescriptor`] trait defines the behavior of each handle type: /// /// * The true underlying ("target") type the handle represents. For safety reasons, target type -/// must always be [`Send`]. +/// must always be [`Send`]. /// /// * Mutable (`Box`-like) vs. shared (`Arc`-like). For safety reasons, the target type of a -/// shared handle must always be [`Send`]+[`Sync`]. +/// shared handle must always be [`Send`]+[`Sync`]. /// /// * Sized vs. unsized. Sized types allow handle operations to be implemented more efficiently. 
/// @@ -204,87 +205,10 @@ using NullableCvoid = void*; /// function is that `kernel_str` is _only_ valid until the return from this function using AllocateStringFn = NullableCvoid(*)(KernelStringSlice kernel_str); -/// The `EngineSchemaVisitor` defines a visitor system to allow engines to build their own -/// representation of a schema from a particular schema within kernel. -/// -/// The model is list based. When the kernel needs a list, it will ask engine to allocate one of a -/// particular size. Once allocated the engine returns an `id`, which can be any integer identifier -/// ([`usize`]) the engine wants, and will be passed back to the engine to identify the list in the -/// future. -/// -/// Every schema element the kernel visits belongs to some list of "sibling" elements. The schema -/// itself is a list of schema elements, and every complex type (struct, map, array) contains a list -/// of "child" elements. -/// 1. Before visiting schema or any complex type, the kernel asks the engine to allocate a list to -/// hold its children -/// 2. When visiting any schema element, the kernel passes its parent's "child list" as the -/// "sibling list" the element should be appended to: -/// - For the top-level schema, visit each top-level column, passing the column's name and type -/// - For a struct, first visit each struct field, passing the field's name, type, nullability, -/// and metadata -/// - For a map, visit the key and value, passing its special name ("map_key" or "map_value"), -/// type, and value nullability (keys are never nullable) -/// - For a list, visit the element, passing its special name ("array_element"), type, and -/// nullability -/// 3. When visiting a complex schema element, the kernel also passes the "child list" containing -/// that element's (already-visited) children. -/// 4. 
The [`visit_schema`] method returns the id of the list of top-level columns -struct EngineSchemaVisitor { - /// opaque state pointer - void *data; - /// Creates a new field list, optionally reserving capacity up front - uintptr_t (*make_field_list)(void *data, uintptr_t reserve); - /// Indicate that the schema contains a `Struct` type. The top level of a Schema is always a - /// `Struct`. The fields of the `Struct` are in the list identified by `child_list_id`. - void (*visit_struct)(void *data, - uintptr_t sibling_list_id, - KernelStringSlice name, - uintptr_t child_list_id); - /// Indicate that the schema contains an Array type. `child_list_id` will be a _one_ item list - /// with the array's element type - void (*visit_array)(void *data, - uintptr_t sibling_list_id, - KernelStringSlice name, - bool contains_null, - uintptr_t child_list_id); - /// Indicate that the schema contains an Map type. `child_list_id` will be a _two_ item list - /// where the first element is the map's key type and the second element is the - /// map's value type - void (*visit_map)(void *data, - uintptr_t sibling_list_id, - KernelStringSlice name, - bool value_contains_null, - uintptr_t child_list_id); - /// visit a `decimal` with the specified `precision` and `scale` - void (*visit_decimal)(void *data, - uintptr_t sibling_list_id, - KernelStringSlice name, - uint8_t precision, - uint8_t scale); - /// Visit a `string` belonging to the list identified by `sibling_list_id`. - void (*visit_string)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - /// Visit a `long` belonging to the list identified by `sibling_list_id`. - void (*visit_long)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - /// Visit an `integer` belonging to the list identified by `sibling_list_id`. - void (*visit_integer)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - /// Visit a `short` belonging to the list identified by `sibling_list_id`. 
- void (*visit_short)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - /// Visit a `byte` belonging to the list identified by `sibling_list_id`. - void (*visit_byte)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - /// Visit a `float` belonging to the list identified by `sibling_list_id`. - void (*visit_float)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - /// Visit a `double` belonging to the list identified by `sibling_list_id`. - void (*visit_double)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - /// Visit a `boolean` belonging to the list identified by `sibling_list_id`. - void (*visit_boolean)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - /// Visit `binary` belonging to the list identified by `sibling_list_id`. - void (*visit_binary)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - /// Visit a `date` belonging to the list identified by `sibling_list_id`. - void (*visit_date)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - /// Visit a `timestamp` belonging to the list identified by `sibling_list_id`. - void (*visit_timestamp)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - /// Visit a `timestamp` with no timezone belonging to the list identified by `sibling_list_id`. - void (*visit_timestamp_ntz)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); +struct FileMeta { + KernelStringSlice path; + int64_t last_modified; + uintptr_t size; }; /// Model iterators. 
This allows an engine to specify iteration however it likes, and we simply wrap @@ -297,12 +221,6 @@ struct EngineIterator { const void *(*get_next)(void *data); }; -struct FileMeta { - KernelStringSlice path; - int64_t last_modified; - uintptr_t size; -}; - /// ABI-compatible struct for ArrowArray from C Data Interface /// See /// @@ -341,6 +259,7 @@ struct FFI_ArrowSchema { const char *format; const char *name; const char *metadata; + /// Refer to [Arrow Flags](https://arrow.apache.org/docs/format/CDataInterface.html#c.ArrowSchema.flags) int64_t flags; int64_t n_children; FFI_ArrowSchema **children; @@ -390,7 +309,7 @@ using CScanCallback = void(*)(NullableCvoid engine_context, const DvInfo *dv_info, const CStringMap *partition_map); - // This trickery is from https://github.com/mozilla/cbindgen/issues/402#issuecomment-578680163 +// This trickery is from https://github.com/mozilla/cbindgen/issues/402#issuecomment-578680163 struct im_an_unused_struct_that_tricks_msvc_into_compilation { ExternResult field; ExternResult field2; @@ -405,6 +324,89 @@ struct im_an_unused_struct_that_tricks_msvc_into_compilation { ExternResult field11; }; +/// The `EngineSchemaVisitor` defines a visitor system to allow engines to build their own +/// representation of a schema from a particular schema within kernel. +/// +/// The model is list based. When the kernel needs a list, it will ask engine to allocate one of a +/// particular size. Once allocated the engine returns an `id`, which can be any integer identifier +/// ([`usize`]) the engine wants, and will be passed back to the engine to identify the list in the +/// future. +/// +/// Every schema element the kernel visits belongs to some list of "sibling" elements. The schema +/// itself is a list of schema elements, and every complex type (struct, map, array) contains a list +/// of "child" elements. +/// 1. Before visiting schema or any complex type, the kernel asks the engine to allocate a list to +/// hold its children +/// 2. 
When visiting any schema element, the kernel passes its parent's "child list" as the +/// "sibling list" the element should be appended to: +/// - For the top-level schema, visit each top-level column, passing the column's name and type +/// - For a struct, first visit each struct field, passing the field's name, type, nullability, +/// and metadata +/// - For a map, visit the key and value, passing its special name ("map_key" or "map_value"), +/// type, and value nullability (keys are never nullable) +/// - For a list, visit the element, passing its special name ("array_element"), type, and +/// nullability +/// 3. When visiting a complex schema element, the kernel also passes the "child list" containing +/// that element's (already-visited) children. +/// 4. The [`visit_schema`] method returns the id of the list of top-level columns +struct EngineSchemaVisitor { + /// opaque state pointer + void *data; + /// Creates a new field list, optionally reserving capacity up front + uintptr_t (*make_field_list)(void *data, uintptr_t reserve); + /// Indicate that the schema contains a `Struct` type. The top level of a Schema is always a + /// `Struct`. The fields of the `Struct` are in the list identified by `child_list_id`. + void (*visit_struct)(void *data, + uintptr_t sibling_list_id, + KernelStringSlice name, + uintptr_t child_list_id); + /// Indicate that the schema contains an Array type. `child_list_id` will be a _one_ item list + /// with the array's element type + void (*visit_array)(void *data, + uintptr_t sibling_list_id, + KernelStringSlice name, + bool contains_null, + uintptr_t child_list_id); + /// Indicate that the schema contains a Map type. 
`child_list_id` will be a _two_ item list + /// where the first element is the map's key type and the second element is the + /// map's value type + void (*visit_map)(void *data, + uintptr_t sibling_list_id, + KernelStringSlice name, + bool value_contains_null, + uintptr_t child_list_id); + /// visit a `decimal` with the specified `precision` and `scale` + void (*visit_decimal)(void *data, + uintptr_t sibling_list_id, + KernelStringSlice name, + uint8_t precision, + uint8_t scale); + /// Visit a `string` belonging to the list identified by `sibling_list_id`. + void (*visit_string)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// Visit a `long` belonging to the list identified by `sibling_list_id`. + void (*visit_long)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// Visit an `integer` belonging to the list identified by `sibling_list_id`. + void (*visit_integer)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// Visit a `short` belonging to the list identified by `sibling_list_id`. + void (*visit_short)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// Visit a `byte` belonging to the list identified by `sibling_list_id`. + void (*visit_byte)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// Visit a `float` belonging to the list identified by `sibling_list_id`. + void (*visit_float)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// Visit a `double` belonging to the list identified by `sibling_list_id`. + void (*visit_double)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// Visit a `boolean` belonging to the list identified by `sibling_list_id`. + void (*visit_boolean)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// Visit `binary` belonging to the list identified by `sibling_list_id`. 
+ void (*visit_binary)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// Visit a `date` belonging to the list identified by `sibling_list_id`. + void (*visit_date)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// Visit a `timestamp` belonging to the list identified by `sibling_list_id`. + void (*visit_timestamp)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// Visit a `timestamp` with no timezone belonging to the list identified by `sibling_list_id`. + void (*visit_timestamp_ntz)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); +}; + extern "C" { /// # Safety @@ -516,15 +518,34 @@ bool string_slice_next(Handle data, /// Caller is responsible for (at most once) passing a valid pointer to a [`StringSliceIterator`] void free_string_slice_data(Handle data); -/// Visit the schema of the passed `SnapshotHandle`, using the provided `visitor`. See the -/// documentation of [`EngineSchemaVisitor`] for a description of how this visitor works. +/// Call the engine back with the next `EngineData` batch read by Parquet/Json handler. The +/// _engine_ "owns" the data that is passed into the `engine_visitor`, since it is allocated by the +/// `Engine` being used for log-replay. If the engine wants the kernel to free this data, it _must_ +/// call [`free_engine_data`] on it. /// -/// This method returns the id of the list allocated to hold the top level schema columns. +/// # Safety /// +/// The iterator must be valid (returned by [`read_parquet_file`]) and not yet freed by +/// [`free_read_result_iter`]. The visitor function pointer must be non-null. +ExternResult read_result_next(Handle data, + NullableCvoid engine_context, + void (*engine_visitor)(NullableCvoid engine_context, + Handle engine_data)); + +/// Free the memory from the passed read result iterator /// # Safety /// -/// Caller is responsible for passing a valid snapshot handle and schema visitor. 
-uintptr_t visit_schema(Handle snapshot, EngineSchemaVisitor *visitor); +/// Caller is responsible for (at most once) passing a valid pointer returned by a call to +/// [`read_parquet_file`]. +void free_read_result_iter(Handle data); + +/// Use the specified engine's [`delta_kernel::ParquetHandler`] to read the specified file. +/// +/// # Safety +/// Caller is responsible for calling with a valid `ExternEngineHandle` and `FileMeta` +ExternResult> read_parquet_file(Handle engine, + const FileMeta *file, + Handle physical_schema); uintptr_t visit_expression_and(KernelExpressionVisitorState *state, EngineIterator *children); @@ -568,35 +589,6 @@ uintptr_t visit_expression_literal_double(KernelExpressionVisitorState *state, d uintptr_t visit_expression_literal_bool(KernelExpressionVisitorState *state, bool value); -/// Call the engine back with the next `EngingeData` batch read by Parquet/Json handler. The -/// _engine_ "owns" the data that is passed into the `engine_visitor`, since it is allocated by the -/// `Engine` being used for log-replay. If the engine wants the kernel to free this data, it _must_ -/// call [`free_engine_data`] on it. -/// -/// # Safety -/// -/// The iterator must be valid (returned by [`read_parquet_file`]) and not yet freed by -/// [`free_read_result_iter`]. The visitor function pointer must be non-null. -ExternResult read_result_next(Handle data, - NullableCvoid engine_context, - void (*engine_visitor)(NullableCvoid engine_context, - Handle engine_data)); - -/// Free the memory from the passed read result iterator -/// # Safety -/// -/// Caller is responsible for (at most once) passing a valid pointer returned by a call to -/// [`read_parquet_file`]. -void free_read_result_iter(Handle data); - -/// Use the specified engine's [`delta_kernel::ParquetHandler`] to read the specified file. 
-/// -/// # Safety -/// Caller is responsible for calling with a valid `ExternEngineHandle` and `FileMeta` -ExternResult> read_parquet_file(Handle engine, - const FileMeta *file, - Handle physical_schema); - /// Get the number of rows in an engine data /// /// # Safety @@ -737,6 +729,16 @@ void visit_scan_data(Handle data, NullableCvoid engine_context, CScanCallback callback); +/// Visit the schema of the passed `SnapshotHandle`, using the provided `visitor`. See the +/// documentation of [`EngineSchemaVisitor`] for a description of how this visitor works. +/// +/// This method returns the id of the list allocated to hold the top level schema columns. +/// +/// # Safety +/// +/// Caller is responsible for passing a valid snapshot handle and schema visitor. +uintptr_t visit_schema(Handle snapshot, EngineSchemaVisitor *visitor); + } // extern "C" } // namespace ffi