diff --git a/Cargo.toml b/Cargo.toml index 6afda6f..a85ce98 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "mapped_hyph" description = "Hyphenation using precompiled memory-mapped tables" -version = "0.4.0" +version = "0.4.1" authors = ["Jonathan Kew "] license = "MIT/Apache-2.0" edition = "2018" diff --git a/src/builder.rs b/src/builder.rs index 3ad4769..a1ab0c4 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -132,7 +132,10 @@ impl LevelBuilder { let mut got_digit = false; for byte in bytes { if *byte <= b'9' && *byte >= b'0' { - assert!(!got_digit, "invalid pattern \"{}\": consecutive digits", pattern); + if got_digit { + println!("invalid pattern \"{}\": consecutive digits", pattern); + return; + } digits.push(*byte); got_digit = true; } else { @@ -157,7 +160,10 @@ impl LevelBuilder { // Convert repl_index and repl_cut from Unicode char to byte indexing. let start = if text[0] == b'.' { 1 } else { 0 }; if start == 1 { - assert_eq!(digits[0], b'0', "unexpected digit before start of word"); + if digits[0] != b'0' { + println!("invalid pattern \"{}\": unexpected digit before start of word", pattern); + return; + } digits.remove(0); } let word = std::str::from_utf8(&text[start..]).unwrap(); @@ -171,7 +177,10 @@ impl LevelBuilder { // (which should not already have a match_string). let mut state_num = self.find_state_number_for(&text); let mut state = &mut self.states[state_num as usize]; - assert!(state.match_string.is_none(), "duplicate pattern?"); + if state.match_string.is_some() { + println!("duplicate pattern \"{}\" discarded", pattern); + return; + } if !digits.is_empty() { state.match_string = Some(digits); } @@ -188,7 +197,7 @@ impl LevelBuilder { text.truncate(text.len() - 1); state_num = self.find_state_number_for(&text); if let Some(exists) = self.states[state_num as usize].transitions.0.insert(ch, last_state) { - assert_eq!(exists, last_state, "overwriting existing transition?"); + assert_eq!(exists, last_state, "overwriting existing transition at pattern \"{}\"", pattern); break; } } @@ -349,7 +358,7 @@ impl LevelBuilder { /// machine transitions, etc. /// The returned Vec can be passed to write_hyf_file() to generate a flattened /// representation of the state machine in mapped_hyph's binary format. -fn read_dic_file(dic_file: T, compress: bool) -> Vec { +fn read_dic_file(dic_file: T, compress: bool) -> Result, &'static str> { let reader = BufReader::new(dic_file); let mut builders = Vec::::new(); @@ -370,14 +379,19 @@ fn read_dic_file(dic_file: T, compress: bool) -> Vec { if trimmed.as_bytes()[0] >= b'A' && trimmed.as_bytes()[0] <= b'Z' { // First line is encoding; we only support UTF-8. if builder.encoding.is_none() { - assert_eq!(trimmed, "UTF-8", "Only UTF-8 patterns are accepted!"); + if trimmed != "UTF-8" { + return Err("Only UTF-8 patterns are accepted!"); + }; builder.encoding = Some(trimmed); continue; } // Check for valid keyword-value pairs. if trimmed.contains(' ') { let parts: Vec<&str> = trimmed.split(' ').collect(); - assert!(parts.len() == 2); + if parts.len() != 2 { + println!("unrecognized keyword/values: {}", trimmed); + continue; + } let keyword = parts[0]; let value = parts[1]; match keyword { @@ -399,8 +413,12 @@ fn read_dic_file(dic_file: T, compress: bool) -> Vec { println!("unknown keyword: {}", trimmed); continue; } - // Patterns should always be provided in lowercase; complain if not. - assert_eq!(trimmed, trimmed.to_lowercase(), "pattern \"{}\" not lowercased at line {}", trimmed, index); + // Patterns should always be provided in lowercase; complain if not, and discard + // the bad pattern. + if trimmed != trimmed.to_lowercase() { + println!("pattern \"{}\" not lowercased at line {}", trimmed, index); + continue; + } builder.add_pattern(&trimmed); } @@ -446,7 +464,7 @@ fn read_dic_file(dic_file: T, compress: bool) -> Vec { } } - builders + Ok(builders) } /// Write out the state machines representing a set of hyphenation rules @@ -481,5 +499,11 @@ fn write_hyf_file(hyf_file: &mut T, levels: Vec) -> std: /// to `hyf_file`. The `compress` param determines whether extra processing to reduce the /// size of the output is performed. pub fn compile(dic_file: T1, hyf_file: &mut T2, compress: bool) -> std::io::Result<()> { - write_hyf_file(hyf_file, read_dic_file(dic_file, compress)) + match read_dic_file(dic_file, compress) { + Ok(dic) => write_hyf_file(hyf_file, dic), + Err(e) => { + println!("parse error: {}", e); + return Err(Error::from(ErrorKind::InvalidData)) + } + } }