Skip to content

Commit

Permalink
[0.4.0] Features needed for runtime .dic compilation in gecko.
Browse files Browse the repository at this point in the history
Added `compress` param to `read_dic_file`, to choose between faster compilation or smaller result;
added FFI functions to compile .dic files into state tables and return as a memory buffer
for Gecko to manage further.

Simplified `builder` API to just expose a single `compile()` function instead of a two-step process.
  • Loading branch information
jfkthame committed Sep 24, 2020
1 parent 99c778f commit c7737af
Show file tree
Hide file tree
Showing 6 changed files with 123 additions and 17 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
name = "mapped_hyph"
description = "Hyphenation using precompiled memory-mapped tables"
version = "0.3.0"
version = "0.4.0"
authors = ["Jonathan Kew <[email protected]>"]
license = "MIT/Apache-2.0"
edition = "2018"
Expand Down
11 changes: 10 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ comments in the source.

## C and C++ bindings

See the `mapped_hyph.h` header for C/C++ APIs that can be used to load hyphenation files
See src/ffi.rs for C/C++ APIs that can be used to load hyphenation files
and to locate valid hyphenation positions in a word.

## Sample programs
Expand All @@ -63,6 +63,15 @@ included here, as it is handy for testing purposes.)

## Release Notes

### 0.4.0

* Added a boolean `compress` param to the pattern compiler to control whether
it attempts to compress the compiled table by merging duplicate states (which
takes significant extra time).

* Added FFI functions to compile hyphenation tables from a file path or a buffer,
intended for use from Gecko.

### 0.3.0

* Switched from MPL2 to Apache2/MIT dual license.
Expand Down
4 changes: 2 additions & 2 deletions src/bin/hyf_compile.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2019 Mozilla Foundation. See the COPYRIGHT
// Copyright 2019-2020 Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
Expand All @@ -17,7 +17,7 @@ fn main() -> std::io::Result<()> {
if args.len() == 3 {
let in_file = File::open(&args[1])?;
let mut out_file = File::create(&args[2])?;
mapped_hyph::builder::write_hyf_file(&mut out_file, mapped_hyph::builder::read_dic_file(&in_file))?;
mapped_hyph::builder::compile(&in_file, &mut out_file, true)?;
} else {
println!("usage: hyf_compile <pattern-file> <output-file>");
}
Expand Down
32 changes: 22 additions & 10 deletions src/builder.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2019 Mozilla Foundation. See the COPYRIGHT
// Copyright 2019-2020 Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
Expand All @@ -10,7 +10,7 @@
/// Functions to compile human-readable patterns into a mapped_hyph
/// flattened representation of the hyphenation state machine.
use std::io::{Read,BufRead,BufReader,Write};
use std::io::{Read,BufRead,BufReader,Write,Error,ErrorKind};
use std::collections::HashMap;
use std::convert::TryInto;
use std::hash::{Hash,Hasher};
Expand Down Expand Up @@ -63,10 +63,10 @@ impl State {
}
}

/// This is only public because the read_dic_file() function returns a Vec
/// of LevelBuilder structs, which can then be passed to write_hyf_file()
/// Structures returned by the read_dic_file() function;
/// a Vec of these can then be passed to write_hyf_file()
/// to create the flattened output.
pub struct LevelBuilder {
struct LevelBuilder {
states: Vec<State>,
str_to_state: HashMap<Vec<u8>,i32>,
encoding: Option<String>,
Expand Down Expand Up @@ -349,7 +349,7 @@ impl LevelBuilder {
/// machine transitions, etc.
/// The returned Vec can be passed to write_hyf_file() to generate a flattened
/// representation of the state machine in mapped_hyph's binary format.
pub fn read_dic_file<T: Read>(dic_file: T) -> Vec<LevelBuilder> {
fn read_dic_file<T: Read>(dic_file: T, compress: bool) -> Vec<LevelBuilder> {
let reader = BufReader::new(dic_file);

let mut builders = Vec::<LevelBuilder>::new();
Expand Down Expand Up @@ -439,17 +439,22 @@ pub fn read_dic_file<T: Read>(dic_file: T) -> Vec<LevelBuilder> {
}
}

// Merge duplicate states to reduce size.
for builder in &mut builders {
builder.merge_duplicate_states();
if compress {
// Merge duplicate states to reduce size.
for builder in &mut builders {
builder.merge_duplicate_states();
}
}

builders
}

/// Write out the state machines representing a set of hyphenation rules
/// to the given output stream.
pub fn write_hyf_file<T: Write>(hyf_file: &mut T, levels: Vec<LevelBuilder>) -> std::io::Result<()> {
fn write_hyf_file<T: Write>(hyf_file: &mut T, levels: Vec<LevelBuilder>) -> std::io::Result<()> {
if levels.is_empty() {
return Err(Error::from(ErrorKind::InvalidData));
}
let mut flattened = vec![];
for level in levels {
flattened.push(level.flatten());
Expand All @@ -471,3 +476,10 @@ pub fn write_hyf_file<T: Write>(hyf_file: &mut T, levels: Vec<LevelBuilder>) ->
}
Ok(())
}

/// Public entry point of the pattern compiler.
///
/// Reads human-readable hyphenation patterns from `dic_file`, builds the state
/// machine levels from them, and writes the flattened tables to `hyf_file`.
/// If `compress` is true, the extra size-reducing pass is run on the levels
/// before they are written.
///
/// Any error reported while writing the tables is returned to the caller.
pub fn compile<T1: Read, T2: Write>(dic_file: T1, hyf_file: &mut T2, compress: bool) -> std::io::Result<()> {
    // Step 1: parse the patterns into per-level builders.
    let levels = read_dic_file(dic_file, compress);
    // Step 2: flatten the builders and emit them to the output stream.
    write_hyf_file(hyf_file, levels)
}
87 changes: 86 additions & 1 deletion src/ffi.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2019 Mozilla Foundation. See the COPYRIGHT
// Copyright 2019-2020 Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
Expand All @@ -10,6 +10,8 @@
use std::slice;
use std::str;
use std::ffi::CStr;
use std::fs::File;
use std::io::Read;
use std::os::raw::c_char;
use std::str::Utf8Error;

Expand All @@ -21,6 +23,9 @@ use super::Hyphenator;
/// for use in FFI function signatures.
pub struct HyphDic;

/// Opaque type representing a compiled dictionary in a memory buffer.
pub struct CompiledData;

// Helper to convert word and hyphen buffer parameters from raw C pointer/length
// pairs to the Rust types expected by mapped_hyph.
unsafe fn params_from_c<'a>(word: *const c_char, word_len: u32,
Expand Down Expand Up @@ -163,3 +168,83 @@ pub unsafe extern "C" fn mapped_hyph_is_valid_hyphenator(dic_buf: *const u8, dic
let dic = Hyphenator::new(slice::from_raw_parts(dic_buf, dic_len as usize));
dic.is_valid_hyphenator()
}

/// C-callable function to free a CompiledData object created by
/// a `mapped_hyph_compile_...` function (below).
///
/// Passing a null pointer is a harmless no-op.
///
/// # Safety
/// The `data` parameter must be either null, or a `CompiledData` pointer obtained
/// from a `mapped_hyph_compile_...` function and not previously freed.
#[no_mangle]
pub unsafe extern "C" fn mapped_hyph_free_compiled_data(data: *mut CompiledData) {
    if data.is_null() {
        return;
    }
    // The pointer handed out by the compile functions is really a boxed `Vec<u8>`
    // that was cast to the opaque `CompiledData` type, so it has to be cast back
    // before the Box is reconstituted. Rebuilding it as a `Box<CompiledData>`
    // (a zero-sized type) would release neither the Box allocation nor the
    // Vec's buffer, leaking the whole compiled table.
    drop(Box::from_raw(data as *mut Vec<u8>));
}

// Shared back end for the FFI compile entry points (memory buffer or file path).
// Compiles the patterns read from `input` into an in-memory table and hands it
// back as an opaque heap pointer; returns null if compilation fails.
fn compile_and_wrap<T: Read>(input: T, compress: bool) -> *const CompiledData {
    let mut table = Vec::<u8>::new();
    match super::builder::compile(input, &mut table, compress) {
        Ok(()) => {
            // Release any spare capacity before the buffer is handed over.
            table.shrink_to_fit();
            // Move the Vec onto the heap and leak the Box, so the data persists
            // until the caller releases it through the raw pointer.
            let boxed = Box::new(table);
            Box::into_raw(boxed) as *const CompiledData
        }
        Err(_) => std::ptr::null(),
    }
}

/// C-callable function to compile hyphenation patterns from `pattern_buf` and return
/// the compiled data in a memory buffer, suitable to be stored somewhere or passed
/// to `mapped_hyph_find_hyphen_values_raw` to perform hyphenation.
///
/// Returns null if `pattern_buf` is null or if compilation fails.
///
/// The returned `CompiledData` must be released with `mapped_hyph_free_compiled_data`.
///
/// # Safety
/// The `pattern_buf` parameter must be either null, or a valid pointer to a memory
/// block of size at least `pattern_len`.
#[no_mangle]
pub unsafe extern "C" fn mapped_hyph_compile_buffer(pattern_buf: *const u8, pattern_len: u32, compress: bool) -> *const CompiledData {
    // `slice::from_raw_parts` requires a non-null pointer even for a zero-length
    // slice, so a null buffer is rejected up front instead of invoking
    // undefined behavior.
    if pattern_buf.is_null() {
        return std::ptr::null();
    }
    let patterns = slice::from_raw_parts(pattern_buf, pattern_len as usize);
    compile_and_wrap(patterns, compress)
}

/// C-callable function to compile hyphenation patterns from a file to a memory buffer.
///
/// Returns null if `path` is null or is not valid UTF-8, if the file cannot be
/// opened, or if compilation fails.
///
/// The returned `CompiledData` must be released with `mapped_hyph_free_compiled_data`.
///
/// # Safety
/// The given `path` must be either null, or a valid pointer to a NUL-terminated
/// (C-style) string.
#[no_mangle]
pub unsafe extern "C" fn mapped_hyph_compile_file(path: *const c_char, compress: bool) -> *const CompiledData {
    // `CStr::from_ptr` must not be given a null pointer; treat it like any
    // other unusable path and report failure.
    if path.is_null() {
        return std::ptr::null();
    }
    // The path has to be valid UTF-8 to be used as a `&str`.
    let path_str = match CStr::from_ptr(path).to_str() {
        Ok(s) => s,
        Err(_) => return std::ptr::null(),
    };
    // Try to open the file at the given path, returning null on failure.
    match File::open(path_str) {
        Ok(in_file) => compile_and_wrap(&in_file, compress),
        Err(_) => std::ptr::null(),
    }
}

/// Get the size of the compiled table buffer in a `CompiledData` object.
///
/// Returns 0 if `data` is null.
///
/// # Safety
/// The `data` parameter must be either null, or a `CompiledData` pointer obtained
/// from a `mapped_hyph_compile_...` function and not previously freed.
#[no_mangle]
pub unsafe extern "C" fn mapped_hyph_compiled_data_size(data: *const CompiledData) -> u32 {
    // The compile functions return null on failure; tolerate that being passed
    // straight through rather than dereferencing it.
    if data.is_null() {
        return 0;
    }
    // The opaque pointer is really a boxed `Vec<u8>` holding the compiled table.
    let table = &*(data as *const Vec<u8>);
    // NOTE(review): `as u32` would truncate a length above u32::MAX; presumably
    // compiled tables never get that large -- confirm.
    table.len() as u32
}

/// Get a pointer to the raw data held by a `CompiledData` object.
///
/// Returns null if `data` is null.
///
/// # Safety
/// The `data` parameter must be either null, or a `CompiledData` pointer obtained
/// from a `mapped_hyph_compile_...` function and not previously freed.
///
/// The returned pointer only remains valid as long as the `CompiledData` has not
/// been released (by passing it to `mapped_hyph_free_compiled_data`).
#[no_mangle]
pub unsafe extern "C" fn mapped_hyph_compiled_data_ptr(data: *const CompiledData) -> *const u8 {
    // The compile functions return null on failure; tolerate that being passed
    // straight through rather than dereferencing it.
    if data.is_null() {
        return std::ptr::null();
    }
    // The opaque pointer is really a boxed `Vec<u8>` holding the compiled table.
    let table = &*(data as *const Vec<u8>);
    table.as_ptr()
}
4 changes: 2 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@ pub struct Hyphenator<'a>(&'a [u8]);
impl Hyphenator<'_> {
/// Return a Hyphenator that wraps the given buffer.
/// This does *not* check that the given buffer is in fact a valid hyphenation table.
/// Use is_valid_hyphenator() to determine whether it is usable.
/// Use `is_valid_hyphenator()` to determine whether it is usable.
/// (Calling hyphenation methods on a Hyphenator that wraps arbitrary,
/// unvalidated data is not unsafe, but may panic.)
pub fn new(buffer: &[u8]) -> Hyphenator {
Expand Down Expand Up @@ -621,7 +621,7 @@ impl Hyphenator<'_> {
///
/// # Safety
///
/// This is unsafe for the same reason Mmap::map() is unsafe:
/// This is unsafe for the same reason `Mmap::map()` is unsafe:
/// mapped_hyph does not guarantee safety if the mapped file is modified
/// (e.g. by another process) while we're using it.
///
Expand Down

0 comments on commit c7737af

Please sign in to comment.