From 69f6ae42f1ba81a4039367142a7635cb5cb321bf Mon Sep 17 00:00:00 2001 From: Jonathan Kew Date: Mon, 5 Oct 2020 13:50:59 +0100 Subject: [PATCH] [0.4.1] Just return an error (rather than asserting) on parse errors in the .dic compiler. Now that we're potentially compiling .dic files at runtime in Firefox, we need slightly more complete error handling: either discard the bad fragment of the input, or reject the entire file, but don't assert and hence crash the entire process. --- Cargo.toml | 2 +- src/builder.rs | 46 +++++++++++++++++++++++++++++++++++----------- 2 files changed, 36 insertions(+), 12 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 6afda6f..a85ce98 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "mapped_hyph" description = "Hyphenation using precompiled memory-mapped tables" -version = "0.4.0" +version = "0.4.1" authors = ["Jonathan Kew "] license = "MIT/Apache-2.0" edition = "2018" diff --git a/src/builder.rs b/src/builder.rs index 3ad4769..a1ab0c4 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -132,7 +132,10 @@ impl LevelBuilder { let mut got_digit = false; for byte in bytes { if *byte <= b'9' && *byte >= b'0' { - assert!(!got_digit, "invalid pattern \"{}\": consecutive digits", pattern); + if got_digit { + println!("invalid pattern \"{}\": consecutive digits", pattern); + return; + } digits.push(*byte); got_digit = true; } else { @@ -157,7 +160,10 @@ impl LevelBuilder { // Convert repl_index and repl_cut from Unicode char to byte indexing. let start = if text[0] == b'.' { 1 } else { 0 }; if start == 1 { - assert_eq!(digits[0], b'0', "unexpected digit before start of word"); + if digits[0] != b'0' { + println!("invalid pattern \"{}\": unexpected digit before start of word", pattern); + return; + } digits.remove(0); } let word = std::str::from_utf8(&text[start..]).unwrap(); @@ -171,7 +177,10 @@ impl LevelBuilder { // (which should not already have a match_string). let mut state_num = self.find_state_number_for(&text); let mut state = &mut self.states[state_num as usize]; - assert!(state.match_string.is_none(), "duplicate pattern?"); + if state.match_string.is_some() { + println!("duplicate pattern \"{}\" discarded", pattern); + return; + } if !digits.is_empty() { state.match_string = Some(digits); } @@ -188,7 +197,7 @@ impl LevelBuilder { text.truncate(text.len() - 1); state_num = self.find_state_number_for(&text); if let Some(exists) = self.states[state_num as usize].transitions.0.insert(ch, last_state) { - assert_eq!(exists, last_state, "overwriting existing transition?"); + assert_eq!(exists, last_state, "overwriting existing transition at pattern \"{}\"", pattern); break; } } @@ -349,7 +358,7 @@ impl LevelBuilder { /// machine transitions, etc. /// The returned Vec can be passed to write_hyf_file() to generate a flattened /// representation of the state machine in mapped_hyph's binary format. -fn read_dic_file(dic_file: T, compress: bool) -> Vec { +fn read_dic_file(dic_file: T, compress: bool) -> Result, &'static str> { let reader = BufReader::new(dic_file); let mut builders = Vec::::new(); @@ -370,14 +379,19 @@ fn read_dic_file(dic_file: T, compress: bool) -> Vec { if trimmed.as_bytes()[0] >= b'A' && trimmed.as_bytes()[0] <= b'Z' { // First line is encoding; we only support UTF-8. if builder.encoding.is_none() { - assert_eq!(trimmed, "UTF-8", "Only UTF-8 patterns are accepted!"); + if trimmed != "UTF-8" { + return Err("Only UTF-8 patterns are accepted!"); + }; builder.encoding = Some(trimmed); continue; } // Check for valid keyword-value pairs. if trimmed.contains(' ') { let parts: Vec<&str> = trimmed.split(' ').collect(); - assert!(parts.len() == 2); + if parts.len() != 2 { + println!("unrecognized keyword/values: {}", trimmed); + continue; + } let keyword = parts[0]; let value = parts[1]; match keyword { @@ -399,8 +413,12 @@ fn read_dic_file(dic_file: T, compress: bool) -> Vec { println!("unknown keyword: {}", trimmed); continue; } - // Patterns should always be provided in lowercase; complain if not. - assert_eq!(trimmed, trimmed.to_lowercase(), "pattern \"{}\" not lowercased at line {}", trimmed, index); + // Patterns should always be provided in lowercase; complain if not, and discard + // the bad pattern. + if trimmed != trimmed.to_lowercase() { + println!("pattern \"{}\" not lowercased at line {}", trimmed, index); + continue; + } builder.add_pattern(&trimmed); } @@ -446,7 +464,7 @@ fn read_dic_file(dic_file: T, compress: bool) -> Vec { } } - builders + Ok(builders) } /// Write out the state machines representing a set of hyphenation rules @@ -481,5 +499,11 @@ fn write_hyf_file(hyf_file: &mut T, levels: Vec) -> std: /// to `hyf_file`. The `compress` param determines whether extra processing to reduce the /// size of the output is performed. pub fn compile(dic_file: T1, hyf_file: &mut T2, compress: bool) -> std::io::Result<()> { - write_hyf_file(hyf_file, read_dic_file(dic_file, compress)) + match read_dic_file(dic_file, compress) { + Ok(dic) => write_hyf_file(hyf_file, dic), + Err(e) => { + println!("parse error: {}", e); + return Err(Error::from(ErrorKind::InvalidData)) + } + } }