From c7737af5ebe9b404c6b7eed6006785ea41337ca1 Mon Sep 17 00:00:00 2001 From: Jonathan Kew Date: Thu, 24 Sep 2020 18:45:42 +0100 Subject: [PATCH] [0.4.0] Features needed for runtime .dic compilation in gecko. Added `compress` param to `read_dic_file`, to choose between faster compilation or smaller result; added FFI functions to compile .dic files into state tables and return as a memory buffer for Gecko to manage further. Simplified `builder` API to just expose a single `compile()` function instead of a two-step process. --- Cargo.toml | 2 +- README.md | 11 +++++- src/bin/hyf_compile.rs | 4 +- src/builder.rs | 32 +++++++++++----- src/ffi.rs | 87 +++++++++++++++++++++++++++++++++++++++++- src/lib.rs | 4 +- 6 files changed, 123 insertions(+), 17 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 76380c3..6afda6f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "mapped_hyph" description = "Hyphenation using precompiled memory-mapped tables" -version = "0.3.0" +version = "0.4.0" authors = ["Jonathan Kew "] license = "MIT/Apache-2.0" edition = "2018" diff --git a/README.md b/README.md index 2133bd7..c4251e5 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,7 @@ comments in the source. ## C and C++ bindings -See the `mapped_hyph.h` header for C/C++ APIs that can be used to load hyphenation files +See src/ffi.rs for C/C++ APIs that can be used to load hyphenation files and to locate valid hyphenation positions in a word. ## Sample programs @@ -63,6 +63,15 @@ included here, as it is handy for testing purposes.) ## Release Notes +### 0.4.0 + +* Added a boolean `compress` param to the pattern compiler to control whether + it attempts to compress the compiled table by merging duplicate states (which + takes significant extra time). + +* Added FFI functions to compile hyphenation tables from a file path or a buffer, + intended for use from Gecko. + ### 0.3.0 * Switched from MPL2 to Apache2/MIT dual license. 
diff --git a/src/bin/hyf_compile.rs b/src/bin/hyf_compile.rs index 4e16711..05b7e4c 100644 --- a/src/bin/hyf_compile.rs +++ b/src/bin/hyf_compile.rs @@ -1,4 +1,4 @@ -// Copyright 2019 Mozilla Foundation. See the COPYRIGHT +// Copyright 2019-2020 Mozilla Foundation. See the COPYRIGHT // file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 std::io::Result<()> { if args.len() == 3 { let in_file = File::open(&args[1])?; let mut out_file = File::create(&args[2])?; - mapped_hyph::builder::write_hyf_file(&mut out_file, mapped_hyph::builder::read_dic_file(&in_file))?; + mapped_hyph::builder::compile(&in_file, &mut out_file, true)?; } else { println!("usage: hyf_compile "); } diff --git a/src/builder.rs b/src/builder.rs index 7a13947..3ad4769 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -1,4 +1,4 @@ -// Copyright 2019 Mozilla Foundation. See the COPYRIGHT +// Copyright 2019-2020 Mozilla Foundation. See the COPYRIGHT // file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 , str_to_state: HashMap,i32>, encoding: Option, @@ -349,7 +349,7 @@ impl LevelBuilder { /// machine transitions, etc. /// The returned Vec can be passed to write_hyf_file() to generate a flattened /// representation of the state machine in mapped_hyph's binary format. -pub fn read_dic_file(dic_file: T) -> Vec { +fn read_dic_file(dic_file: T, compress: bool) -> Vec { let reader = BufReader::new(dic_file); let mut builders = Vec::::new(); @@ -439,9 +439,11 @@ pub fn read_dic_file(dic_file: T) -> Vec { } } - // Merge duplicate states to reduce size. - for builder in &mut builders { - builder.merge_duplicate_states(); + if compress { + // Merge duplicate states to reduce size. 
+ for builder in &mut builders { + builder.merge_duplicate_states(); + } } builders @@ -449,7 +451,10 @@ pub fn read_dic_file(dic_file: T) -> Vec { /// Write out the state machines representing a set of hyphenation rules /// to the given output stream. -pub fn write_hyf_file(hyf_file: &mut T, levels: Vec) -> std::io::Result<()> { +fn write_hyf_file(hyf_file: &mut T, levels: Vec) -> std::io::Result<()> { + if levels.is_empty() { + return Err(Error::from(ErrorKind::InvalidData)); + } let mut flattened = vec![]; for level in levels { flattened.push(level.flatten()); @@ -471,3 +476,10 @@ pub fn write_hyf_file(hyf_file: &mut T, levels: Vec) -> } Ok(()) } + +/// The public API to the compilation process: reads `dic_file` and writes compiled tables +/// to `hyf_file`. The `compress` param determines whether extra processing to reduce the +/// size of the output is performed. +pub fn compile(dic_file: T1, hyf_file: &mut T2, compress: bool) -> std::io::Result<()> { + write_hyf_file(hyf_file, read_dic_file(dic_file, compress)) +} diff --git a/src/ffi.rs b/src/ffi.rs index 6e37596..d6c8182 100644 --- a/src/ffi.rs +++ b/src/ffi.rs @@ -1,4 +1,4 @@ -// Copyright 2019 Mozilla Foundation. See the COPYRIGHT +// Copyright 2019-2020 Mozilla Foundation. See the COPYRIGHT // file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 (word: *const c_char, word_len: u32, @@ -163,3 +168,83 @@ pub unsafe extern "C" fn mapped_hyph_is_valid_hyphenator(dic_buf: *const u8, dic let dic = Hyphenator::new(slice::from_raw_parts(dic_buf, dic_len as usize)); dic.is_valid_hyphenator() } + +/// C-callable function to free a CompiledData object created by +/// a `mapped_hyph_compile_...` function (below). +/// +/// # Safety +/// The `data` parameter must be a `CompiledData` pointer obtained from +/// a `mapped_hyph_compile_...` function, and not previously freed. 
+#[no_mangle]
+pub unsafe extern "C" fn mapped_hyph_free_compiled_data(data: *mut CompiledData) {
+    drop(Box::from_raw(data as *mut Vec<u8>)); // Rebuild the Box<Vec<u8>> made by compile_and_wrap so the Vec and its buffer are really freed.
+}
+
+// Helper for the compilation functions (from either memory buffer or file path).
+fn compile_and_wrap<T: Read>(input: T, compress: bool) -> *const CompiledData {
+    let mut compiled: Vec<u8> = vec![];
+    if super::builder::compile(input, &mut compiled, compress).is_err() {
+        return std::ptr::null();
+    }
+    compiled.shrink_to_fit();
+
+    // Create a persistent heap reference to the compiled data, and return a pointer to it.
+    Box::into_raw(Box::new(compiled)) as *const CompiledData
+}
+
+/// C-callable function to compile hyphenation patterns from `pattern_buf` and return
+/// the compiled data in a memory buffer, suitable to be stored somewhere or passed
+/// to `mapped_hyph_find_hyphen_values_raw` to perform hyphenation.
+///
+/// The returned `CompiledData` must be released with `mapped_hyph_free_compiled_data`.
+///
+/// # Safety
+/// The `pattern_buf` parameter must be a valid pointer to a memory block of size
+/// at least `pattern_len`.
+#[no_mangle]
+pub unsafe extern "C" fn mapped_hyph_compile_buffer(pattern_buf: *const u8, pattern_len: u32, compress: bool) -> *const CompiledData {
+    compile_and_wrap(slice::from_raw_parts(pattern_buf, pattern_len as usize), compress)
+}
+
+/// C-callable function to compile hyphenation patterns from a file to a memory buffer.
+///
+/// The returned `CompiledData` must be released with `mapped_hyph_free_compiled_data`.
+///
+/// # Safety
+/// The given `path` must be a valid pointer to a NUL-terminated (C-style) string.
+#[no_mangle]
+pub unsafe extern "C" fn mapped_hyph_compile_file(path: *const c_char, compress: bool) -> *const CompiledData {
+    // Try to open the file at the given path, returning null on failure.
+    let path_str = match CStr::from_ptr(path).to_str() {
+        Ok(str) => str,
+        Err(_) => return std::ptr::null(),
+    };
+    let in_file = match File::open(path_str) {
+        Ok(file) => file,
+        Err(_) => return std::ptr::null(),
+    };
+    compile_and_wrap(&in_file, compress)
+}
+
+/// Get the size of the compiled table buffer in a `CompiledData` object.
+///
+/// # Safety
+/// The `data` parameter must be a `CompiledData` pointer obtained from
+/// a `mapped_hyph_compile_...` function, and not previously freed.
+#[no_mangle]
+pub unsafe extern "C" fn mapped_hyph_compiled_data_size(data: *const CompiledData) -> u32 {
+    (&*(data as *const Vec<u8>)).len() as u32
+}
+
+/// Get a pointer to the raw data held by a `CompiledData` object.
+///
+/// # Safety
+/// The `data` parameter must be a `CompiledData` pointer obtained from
+/// a `mapped_hyph_compile_...` function, and not previously freed.
+///
+/// The returned pointer only remains valid as long as the `CompiledData` has not
+/// been released (by passing it to `mapped_hyph_free_compiled_data`).
+#[no_mangle]
+pub unsafe extern "C" fn mapped_hyph_compiled_data_ptr(data: *const CompiledData) -> *const u8 {
+    (&*(data as *const Vec<u8>)).as_ptr()
+}
diff --git a/src/lib.rs b/src/lib.rs
index 6f68da8..32ba687 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -430,7 +430,7 @@ pub struct Hyphenator<'a>(&'a [u8]);
 impl Hyphenator<'_> {
     /// Return a Hyphenator that wraps the given buffer.
     /// This does *not* check that the given buffer is in fact a valid hyphenation table.
-    /// Use is_valid_hyphenator() to determine whether it is usable.
+    /// Use `is_valid_hyphenator()` to determine whether it is usable.
     /// (Calling hyphenation methods on a Hyphenator that wraps arbitrary,
     /// unvalidated data is not unsafe, but may panic.)
pub fn new(buffer: &[u8]) -> Hyphenator { @@ -621,7 +621,7 @@ impl Hyphenator<'_> { /// /// # Safety /// -/// This is unsafe for the same reason Mmap::map() is unsafe: +/// This is unsafe for the same reason `Mmap::map()` is unsafe: /// mapped_hyph does not guarantee safety if the mapped file is modified /// (e.g. by another process) while we're using it. ///