Skip to content

Commit

Permalink
Merge pull request #111 from samansmink/bump-kernel-to-v0.4.0
Browse files Browse the repository at this point in the history
bump kernel to v0.4.0
  • Loading branch information
samansmink authored Oct 25, 2024
2 parents 8642253 + 182654f commit b7333c0
Show file tree
Hide file tree
Showing 2 changed files with 127 additions and 125 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ ExternalProject_Add(
GIT_REPOSITORY "https://github.com/delta-incubator/delta-kernel-rs"
# WARNING: the FFI headers are currently pinned due to the C linkage issue of the C++ headers. Currently, when bumping
# the kernel version, the produced header in ./src/include/delta_kernel_ffi.hpp should also be bumped, applying the fix
GIT_TAG v0.3.1
GIT_TAG v0.4.0
# Prints the env variables passed to the cargo build to the terminal, useful in debugging because passing them
# through CMake is an error-prone mess
CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${RUST_UNSET_ENV_VARS} ${RUST_ENV_VARS} env
Expand Down
250 changes: 126 additions & 124 deletions src/include/delta_kernel_ffi.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ enum class KernelError {
InvalidStructDataError,
InternalError,
InvalidExpression,
InvalidLogPath,
};

struct CStringMap;
Expand Down Expand Up @@ -107,10 +108,10 @@ struct KernelRowIndexArray {
/// An accompanying [`HandleDescriptor`] trait defines the behavior of each handle type:
///
/// * The true underlying ("target") type the handle represents. For safety reasons, target type
/// must always be [`Send`].
/// must always be [`Send`].
///
/// * Mutable (`Box`-like) vs. shared (`Arc`-like). For safety reasons, the target type of a
/// shared handle must always be [`Send`]+[`Sync`].
/// shared handle must always be [`Send`]+[`Sync`].
///
/// * Sized vs. unsized. Sized types allow handle operations to be implemented more efficiently.
///
Expand Down Expand Up @@ -204,87 +205,10 @@ using NullableCvoid = void*;
/// function is that `kernel_str` is _only_ valid until the return from this function
using AllocateStringFn = NullableCvoid(*)(KernelStringSlice kernel_str);

/// The `EngineSchemaVisitor` defines a visitor system to allow engines to build their own
/// representation of a schema from a particular schema within kernel.
///
/// The model is list based. When the kernel needs a list, it will ask engine to allocate one of a
/// particular size. Once allocated the engine returns an `id`, which can be any integer identifier
/// ([`usize`]) the engine wants, and will be passed back to the engine to identify the list in the
/// future.
///
/// Every schema element the kernel visits belongs to some list of "sibling" elements. The schema
/// itself is a list of schema elements, and every complex type (struct, map, array) contains a list
/// of "child" elements.
/// 1. Before visiting schema or any complex type, the kernel asks the engine to allocate a list to
/// hold its children
/// 2. When visiting any schema element, the kernel passes its parent's "child list" as the
/// "sibling list" the element should be appended to:
/// - For the top-level schema, visit each top-level column, passing the column's name and type
/// - For a struct, first visit each struct field, passing the field's name, type, nullability,
/// and metadata
/// - For a map, visit the key and value, passing its special name ("map_key" or "map_value"),
/// type, and value nullability (keys are never nullable)
/// - For a list, visit the element, passing its special name ("array_element"), type, and
/// nullability
/// 3. When visiting a complex schema element, the kernel also passes the "child list" containing
/// that element's (already-visited) children.
/// 4. The [`visit_schema`] method returns the id of the list of top-level columns
struct EngineSchemaVisitor {
/// opaque state pointer
void *data;
/// Creates a new field list, optionally reserving capacity up front
uintptr_t (*make_field_list)(void *data, uintptr_t reserve);
/// Indicate that the schema contains a `Struct` type. The top level of a Schema is always a
/// `Struct`. The fields of the `Struct` are in the list identified by `child_list_id`.
void (*visit_struct)(void *data,
uintptr_t sibling_list_id,
KernelStringSlice name,
uintptr_t child_list_id);
/// Indicate that the schema contains an Array type. `child_list_id` will be a _one_ item list
/// with the array's element type
void (*visit_array)(void *data,
uintptr_t sibling_list_id,
KernelStringSlice name,
bool contains_null,
uintptr_t child_list_id);
/// Indicate that the schema contains a Map type. `child_list_id` will be a _two_ item list
/// where the first element is the map's key type and the second element is the
/// map's value type
void (*visit_map)(void *data,
uintptr_t sibling_list_id,
KernelStringSlice name,
bool value_contains_null,
uintptr_t child_list_id);
/// visit a `decimal` with the specified `precision` and `scale`
void (*visit_decimal)(void *data,
uintptr_t sibling_list_id,
KernelStringSlice name,
uint8_t precision,
uint8_t scale);
/// Visit a `string` belonging to the list identified by `sibling_list_id`.
void (*visit_string)(void *data, uintptr_t sibling_list_id, KernelStringSlice name);
/// Visit a `long` belonging to the list identified by `sibling_list_id`.
void (*visit_long)(void *data, uintptr_t sibling_list_id, KernelStringSlice name);
/// Visit an `integer` belonging to the list identified by `sibling_list_id`.
void (*visit_integer)(void *data, uintptr_t sibling_list_id, KernelStringSlice name);
/// Visit a `short` belonging to the list identified by `sibling_list_id`.
void (*visit_short)(void *data, uintptr_t sibling_list_id, KernelStringSlice name);
/// Visit a `byte` belonging to the list identified by `sibling_list_id`.
void (*visit_byte)(void *data, uintptr_t sibling_list_id, KernelStringSlice name);
/// Visit a `float` belonging to the list identified by `sibling_list_id`.
void (*visit_float)(void *data, uintptr_t sibling_list_id, KernelStringSlice name);
/// Visit a `double` belonging to the list identified by `sibling_list_id`.
void (*visit_double)(void *data, uintptr_t sibling_list_id, KernelStringSlice name);
/// Visit a `boolean` belonging to the list identified by `sibling_list_id`.
void (*visit_boolean)(void *data, uintptr_t sibling_list_id, KernelStringSlice name);
/// Visit `binary` belonging to the list identified by `sibling_list_id`.
void (*visit_binary)(void *data, uintptr_t sibling_list_id, KernelStringSlice name);
/// Visit a `date` belonging to the list identified by `sibling_list_id`.
void (*visit_date)(void *data, uintptr_t sibling_list_id, KernelStringSlice name);
/// Visit a `timestamp` belonging to the list identified by `sibling_list_id`.
void (*visit_timestamp)(void *data, uintptr_t sibling_list_id, KernelStringSlice name);
/// Visit a `timestamp` with no timezone belonging to the list identified by `sibling_list_id`.
void (*visit_timestamp_ntz)(void *data, uintptr_t sibling_list_id, KernelStringSlice name);
/// Metadata describing a single file. A pointer to this struct is what [`read_parquet_file`]
/// takes as its `file` argument.
struct FileMeta {
/// Path of the file, as a kernel string slice.
KernelStringSlice path;
/// Last-modification time of the file.
/// NOTE(review): presumably milliseconds since the Unix epoch -- confirm against the kernel docs.
int64_t last_modified;
/// Size of the file.
/// NOTE(review): presumably in bytes -- confirm against the kernel docs.
uintptr_t size;
};

/// Model iterators. This allows an engine to specify iteration however it likes, and we simply wrap
Expand All @@ -297,12 +221,6 @@ struct EngineIterator {
const void *(*get_next)(void *data);
};

/// Metadata describing a single file. A pointer to this struct is what [`read_parquet_file`]
/// takes as its `file` argument.
struct FileMeta {
/// Path of the file, as a kernel string slice.
KernelStringSlice path;
/// Last-modification time of the file.
/// NOTE(review): presumably milliseconds since the Unix epoch -- confirm against the kernel docs.
int64_t last_modified;
/// Size of the file.
/// NOTE(review): presumably in bytes -- confirm against the kernel docs.
uintptr_t size;
};

/// ABI-compatible struct for ArrowArray from C Data Interface
/// See <https://arrow.apache.org/docs/format/CDataInterface.html#structure-definitions>
///
Expand Down Expand Up @@ -341,6 +259,7 @@ struct FFI_ArrowSchema {
const char *format;
const char *name;
const char *metadata;
/// Refer to [Arrow Flags](https://arrow.apache.org/docs/format/CDataInterface.html#c.ArrowSchema.flags)
int64_t flags;
int64_t n_children;
FFI_ArrowSchema **children;
Expand Down Expand Up @@ -390,7 +309,7 @@ using CScanCallback = void(*)(NullableCvoid engine_context,
const DvInfo *dv_info,
const CStringMap *partition_map);

// This trickery is from https://github.com/mozilla/cbindgen/issues/402#issuecomment-578680163
// This trickery is from https://github.com/mozilla/cbindgen/issues/402#issuecomment-578680163
struct im_an_unused_struct_that_tricks_msvc_into_compilation {
ExternResult<KernelBoolSlice> field;
ExternResult<bool> field2;
Expand All @@ -405,6 +324,89 @@ struct im_an_unused_struct_that_tricks_msvc_into_compilation {
ExternResult<KernelRowIndexArray> field11;
};

/// The `EngineSchemaVisitor` defines a visitor system to allow engines to build their own
/// representation of a schema from a particular schema within kernel.
///
/// The model is list based. When the kernel needs a list, it will ask the engine to allocate one
/// of a particular size. Once allocated the engine returns an `id`, which can be any integer
/// identifier ([`usize`]) the engine wants, and will be passed back to the engine to identify the
/// list in the future.
///
/// Every schema element the kernel visits belongs to some list of "sibling" elements. The schema
/// itself is a list of schema elements, and every complex type (struct, map, array) contains a list
/// of "child" elements.
///  1. Before visiting schema or any complex type, the kernel asks the engine to allocate a list to
///     hold its children
///  2. When visiting any schema element, the kernel passes its parent's "child list" as the
///     "sibling list" the element should be appended to:
///      - For the top-level schema, visit each top-level column, passing the column's name and type
///      - For a struct, first visit each struct field, passing the field's name, type, nullability,
///        and metadata
///      - For a map, visit the key and value, passing its special name ("map_key" or "map_value"),
///        type, and value nullability (keys are never nullable)
///      - For a list, visit the element, passing its special name ("array_element"), type, and
///        nullability
///  3. When visiting a complex schema element, the kernel also passes the "child list" containing
///     that element's (already-visited) children.
///  4. The [`visit_schema`] method returns the id of the list of top-level columns
///
/// NOTE(review): the callbacks declared below take no nullability or metadata argument for struct
/// fields or primitive types (only `visit_array` and `visit_map` carry a nullability flag), so the
/// "nullability, and metadata" wording above may not match this version of the header -- confirm.
struct EngineSchemaVisitor {
/// Opaque state pointer. It is the `data` parameter of every callback in this struct.
void *data;
/// Creates a new field list, optionally reserving capacity up front. Returns the id of the
/// new list.
uintptr_t (*make_field_list)(void *data, uintptr_t reserve);
/// Indicate that the schema contains a `Struct` type. The top level of a Schema is always a
/// `Struct`. The fields of the `Struct` are in the list identified by `child_list_id`.
void (*visit_struct)(void *data,
uintptr_t sibling_list_id,
KernelStringSlice name,
uintptr_t child_list_id);
/// Indicate that the schema contains an Array type. `child_list_id` will be a _one_ item list
/// with the array's element type
void (*visit_array)(void *data,
uintptr_t sibling_list_id,
KernelStringSlice name,
bool contains_null,
uintptr_t child_list_id);
/// Indicate that the schema contains a Map type. `child_list_id` will be a _two_ item list
/// where the first element is the map's key type and the second element is the
/// map's value type
void (*visit_map)(void *data,
uintptr_t sibling_list_id,
KernelStringSlice name,
bool value_contains_null,
uintptr_t child_list_id);
/// Visit a `decimal` with the specified `precision` and `scale`
void (*visit_decimal)(void *data,
uintptr_t sibling_list_id,
KernelStringSlice name,
uint8_t precision,
uint8_t scale);
/// Visit a `string` belonging to the list identified by `sibling_list_id`.
void (*visit_string)(void *data, uintptr_t sibling_list_id, KernelStringSlice name);
/// Visit a `long` belonging to the list identified by `sibling_list_id`.
void (*visit_long)(void *data, uintptr_t sibling_list_id, KernelStringSlice name);
/// Visit an `integer` belonging to the list identified by `sibling_list_id`.
void (*visit_integer)(void *data, uintptr_t sibling_list_id, KernelStringSlice name);
/// Visit a `short` belonging to the list identified by `sibling_list_id`.
void (*visit_short)(void *data, uintptr_t sibling_list_id, KernelStringSlice name);
/// Visit a `byte` belonging to the list identified by `sibling_list_id`.
void (*visit_byte)(void *data, uintptr_t sibling_list_id, KernelStringSlice name);
/// Visit a `float` belonging to the list identified by `sibling_list_id`.
void (*visit_float)(void *data, uintptr_t sibling_list_id, KernelStringSlice name);
/// Visit a `double` belonging to the list identified by `sibling_list_id`.
void (*visit_double)(void *data, uintptr_t sibling_list_id, KernelStringSlice name);
/// Visit a `boolean` belonging to the list identified by `sibling_list_id`.
void (*visit_boolean)(void *data, uintptr_t sibling_list_id, KernelStringSlice name);
/// Visit a `binary` belonging to the list identified by `sibling_list_id`.
void (*visit_binary)(void *data, uintptr_t sibling_list_id, KernelStringSlice name);
/// Visit a `date` belonging to the list identified by `sibling_list_id`.
void (*visit_date)(void *data, uintptr_t sibling_list_id, KernelStringSlice name);
/// Visit a `timestamp` belonging to the list identified by `sibling_list_id`.
void (*visit_timestamp)(void *data, uintptr_t sibling_list_id, KernelStringSlice name);
/// Visit a `timestamp` with no timezone belonging to the list identified by `sibling_list_id`.
void (*visit_timestamp_ntz)(void *data, uintptr_t sibling_list_id, KernelStringSlice name);
};

extern "C" {

/// # Safety
Expand Down Expand Up @@ -516,15 +518,34 @@ bool string_slice_next(Handle<StringSliceIterator> data,
/// Caller is responsible for (at most once) passing a valid pointer to a [`StringSliceIterator`]
void free_string_slice_data(Handle<StringSliceIterator> data);

/// Visit the schema of the passed `SnapshotHandle`, using the provided `visitor`. See the
/// documentation of [`EngineSchemaVisitor`] for a description of how this visitor works.
/// Call the engine back with the next `EngineData` batch read by Parquet/Json handler. The
/// _engine_ "owns" the data that is passed into the `engine_visitor`, since it is allocated by the
/// `Engine` being used for log-replay. If the engine wants the kernel to free this data, it _must_
/// call [`free_engine_data`] on it.
///
/// This method returns the id of the list allocated to hold the top level schema columns.
/// # Safety
///
/// The iterator must be valid (returned by [`read_parquet_file`]) and not yet freed by
/// [`free_read_result_iter`]. The visitor function pointer must be non-null.
ExternResult<bool> read_result_next(Handle<ExclusiveFileReadResultIterator> data,
NullableCvoid engine_context,
void (*engine_visitor)(NullableCvoid engine_context,
Handle<ExclusiveEngineData> engine_data));

/// Free the memory from the passed read result iterator
/// # Safety
///
/// Caller is responsible for passing a valid snapshot handle and schema visitor.
uintptr_t visit_schema(Handle<SharedSnapshot> snapshot, EngineSchemaVisitor *visitor);
/// Caller is responsible for (at most once) passing a valid pointer returned by a call to
/// [`read_parquet_file`].
void free_read_result_iter(Handle<ExclusiveFileReadResultIterator> data);

/// Use the specified engine's [`delta_kernel::ParquetHandler`] to read the specified file.
///
/// The returned `ExternResult` wraps a handle to a read-result iterator. That iterator is the
/// one consumed by [`read_result_next`] and released with [`free_read_result_iter`].
///
/// * `engine` - handle to the engine whose parquet handler performs the read
/// * `file` - pointer to the [`FileMeta`] describing the file to read
/// * `physical_schema` - handle to the schema used for the read
///   (NOTE(review): presumably the physical schema of the columns to read -- confirm)
///
/// # Safety
/// Caller is responsible for calling with a valid `ExternEngineHandle` and `FileMeta`
ExternResult<Handle<ExclusiveFileReadResultIterator>> read_parquet_file(Handle<SharedExternEngine> engine,
const FileMeta *file,
Handle<SharedSchema> physical_schema);

/// Visit an `and` expression.
///
/// NOTE(review): the signature suggests that `children` iterates over the operands of the
/// conjunction, that `state` is the in-progress expression visitor state, and that the returned
/// `uintptr_t` identifies the resulting expression within `state` -- confirm against the kernel
/// documentation before relying on any of this.
uintptr_t visit_expression_and(KernelExpressionVisitorState *state, EngineIterator *children);

Expand Down Expand Up @@ -568,35 +589,6 @@ uintptr_t visit_expression_literal_double(KernelExpressionVisitorState *state, d

/// Visit a boolean literal expression with the given `value`.
///
/// NOTE(review): `state` is presumably the in-progress expression visitor state and the returned
/// `uintptr_t` an identifier for the literal within it -- confirm against the kernel
/// documentation.
uintptr_t visit_expression_literal_bool(KernelExpressionVisitorState *state, bool value);

/// Call the engine back with the next `EngineData` batch read by Parquet/Json handler. The
/// _engine_ "owns" the data that is passed into the `engine_visitor`, since it is allocated by the
/// `Engine` being used for log-replay. If the engine wants the kernel to free this data, it _must_
/// call [`free_engine_data`] on it.
///
/// * `data` - the read-result iterator to advance
/// * `engine_context` - nullable opaque pointer; `engine_visitor` takes a pointer of the same type
///   as its first argument (NOTE(review): presumably this exact value is handed back -- confirm)
/// * `engine_visitor` - callback that receives the handle to the next batch of engine data
///
/// NOTE(review): the meaning of the `bool` inside the returned `ExternResult` is not stated here;
/// presumably it reports whether a batch was available -- confirm.
///
/// # Safety
///
/// The iterator must be valid (returned by [`read_parquet_file`]) and not yet freed by
/// [`free_read_result_iter`]. The visitor function pointer must be non-null.
ExternResult<bool> read_result_next(Handle<ExclusiveFileReadResultIterator> data,
NullableCvoid engine_context,
void (*engine_visitor)(NullableCvoid engine_context,
Handle<ExclusiveEngineData> engine_data));

/// Free the memory from the passed read result iterator
///
/// * `data` - the read-result iterator to free; it must not be passed to [`read_result_next`]
///   afterwards (that function requires an iterator that has not yet been freed)
///
/// # Safety
///
/// Caller is responsible for (at most once) passing a valid pointer returned by a call to
/// [`read_parquet_file`].
void free_read_result_iter(Handle<ExclusiveFileReadResultIterator> data);

/// Use the specified engine's [`delta_kernel::ParquetHandler`] to read the specified file.
///
/// The returned `ExternResult` wraps a handle to a read-result iterator. That iterator is the
/// one consumed by [`read_result_next`] and released with [`free_read_result_iter`].
///
/// * `engine` - handle to the engine whose parquet handler performs the read
/// * `file` - pointer to the [`FileMeta`] describing the file to read
/// * `physical_schema` - handle to the schema used for the read
///   (NOTE(review): presumably the physical schema of the columns to read -- confirm)
///
/// # Safety
/// Caller is responsible for calling with a valid `ExternEngineHandle` and `FileMeta`
ExternResult<Handle<ExclusiveFileReadResultIterator>> read_parquet_file(Handle<SharedExternEngine> engine,
const FileMeta *file,
Handle<SharedSchema> physical_schema);

/// Get the number of rows in an engine data
///
/// # Safety
Expand Down Expand Up @@ -737,6 +729,16 @@ void visit_scan_data(Handle<ExclusiveEngineData> data,
NullableCvoid engine_context,
CScanCallback callback);

/// Visit the schema of the passed `SnapshotHandle`, using the provided `visitor`. See the
/// documentation of [`EngineSchemaVisitor`] for a description of how this visitor works.
///
/// This method returns the id of the list allocated to hold the top level schema columns.
///
/// * `snapshot` - handle to the snapshot whose schema is visited
/// * `visitor` - pointer to the engine's [`EngineSchemaVisitor`], whose callbacks receive the
///   schema elements
///
/// # Safety
///
/// Caller is responsible for passing a valid snapshot handle and schema visitor.
uintptr_t visit_schema(Handle<SharedSnapshot> snapshot, EngineSchemaVisitor *visitor);

} // extern "C"

} // namespace ffi

0 comments on commit b7333c0

Please sign in to comment.