diff --git a/src/cfb.rs b/src/cfb.rs index a5f644e..d00877f 100644 --- a/src/cfb.rs +++ b/src/cfb.rs @@ -273,12 +273,9 @@ impl Sectors { r: &mut R, len: usize, ) -> Result, CfbError> { - let mut chain = if len > 0 { - Vec::with_capacity(len) - } else { - Vec::new() - }; + let mut chain = Vec::with_capacity(len); while sector_id != ENDOFCHAIN { + // FIXME: see tests/OOM_alloc3.xls, this call OOMs chain.extend_from_slice(self.get(sector_id, r)?); sector_id = fats[sector_id as usize]; } @@ -421,6 +418,11 @@ pub struct XlsEncoding { } impl XlsEncoding { + pub fn unicode() -> XlsEncoding { + XlsEncoding { + encoding: encoding_rs::UTF_16LE, + } + } pub fn from_codepage(codepage: u16) -> Result { let e = codepage::to_encoding(codepage).ok_or(CfbError::CodePageNotFound(codepage))?; Ok(XlsEncoding { encoding: e }) diff --git a/src/lib.rs b/src/lib.rs index 9a0e527..89d5dd5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -471,48 +471,40 @@ impl Range { /// Coordinate list (COO) is the natural way cells are stored /// Inner size is defined only by non empty. /// - /// cells: `Vec` of non empty `Cell`s, sorted by row - /// - /// # Panics - /// - /// panics when a `Cell` row is lower than the first `Cell` row or - /// bigger than the last `Cell` row. - pub fn from_sparse(cells: Vec>) -> Range { - if cells.is_empty() { - Range::empty() - } else { - // search bounds - let row_start = cells.first().unwrap().pos.0; - let row_end = cells.last().unwrap().pos.0; - let mut col_start = u32::MAX; - let mut col_end = 0; - for c in cells.iter().map(|c| c.pos.1) { - if c < col_start { - col_start = c; - } - if c > col_end { - col_end = c - } - } - let cols = (col_end - col_start + 1) as usize; - let rows = (row_end - row_start + 1) as usize; - let len = cols.saturating_mul(rows); - let mut v = vec![T::default(); len]; - v.shrink_to_fit(); - for c in cells { - let row = (c.pos.0 - row_start) as usize; - let col = (c.pos.1 - col_start) as usize; - let idx = row.saturating_mul(cols) + col; - if let Some(v) = v.get_mut(idx) { - *v = c.val; - } - } - Range { - start: (row_start, col_start), - end: (row_end, col_end), - inner: v, + /// cells: `Vec` of non empty `Cell`s + pub fn from_sparse(mut cells: Vec>) -> Range { + // cells do not always appear in (row, col) order + cells.sort_by_key(|cell| (cell.pos.0, cell.pos.1)); + let (row_start, row_end) = match &cells[..] { + [] => return Range::empty(), + [first] => (first.pos.0, first.pos.0), + [first, .., last] => (first.pos.0, last.pos.0), + }; + // search bounds + let mut col_start = u32::MAX; + let mut col_end = 0; + for c in cells.iter().map(|c| c.pos.1) { + col_start = min(col_start, c); + col_end = max(col_end, c); + } + let cols = (col_end - col_start + 1) as usize; + let rows = (row_end - row_start + 1) as usize; + let len = cols.saturating_mul(rows); + let mut v = vec![T::default(); len]; + v.shrink_to_fit(); + for c in cells { + let row = (c.pos.0 - row_start) as usize; + let col = (c.pos.1 - col_start) as usize; + let idx = row.saturating_mul(cols) + col; + if let Some(v) = v.get_mut(idx) { + *v = c.val; } } + Range { + start: (row_start, col_start), + end: (row_end, col_end), + inner: v, + } } /// Set inner value from absolute position diff --git a/src/xls.rs b/src/xls.rs index a04fcbe..6e47539 100644 --- a/src/xls.rs +++ b/src/xls.rs @@ -363,7 +363,7 @@ impl Xls { let cch = r.data[3] as usize; let cce = read_u16(&r.data[4..]) as usize; let mut name = String::new(); - read_unicode_string_no_cch(&encoding, &r.data[14..], &cch, &mut name); + read_unicode_string_no_cch(&r.data[14..], &cch, &mut name); let rgce = &r.data[r.data.len() - cce..]; let formula = parse_defined_names(rgce)?; defined_names.push((name, formula)); @@ -442,8 +442,8 @@ impl Xls { } //0x0201 => cells.push(parse_blank(r.data)?), // 513: Blank 0x0203 => cells.push(parse_number(r.data, &self.formats, self.is_1904)?), // 515: Number - 0x0204 => cells.extend(parse_label(r.data, &encoding, biff)?), // 516: Label [MS-XLS 2.4.148] - 0x0205 => cells.push(parse_bool_err(r.data)?), // 517: BoolErr + 0x0204 => cells.push(parse_label(r.data, &encoding, biff)?), // 516: Label [MS-XLS 2.4.148] + 0x0205 => cells.push(parse_bool_err(r.data)?), // 517: BoolErr 0x0207 => { // 519 String (formula value) let val = Data::String(parse_string(r.data, &encoding, biff)?); @@ -471,24 +471,27 @@ impl Xls { // it will appear in 0x0207 record coming next cells.push(Cell::new(fmla_pos, val)); } - let fmla = parse_formula( - &r.data[20..], - &fmla_sheet_names, - &defined_names, - &xtis, - &encoding, - ) - .unwrap_or_else(|e| { - debug!("{}", e); - format!( - "Unrecognised formula \ + let fmla = + parse_formula(&r.data[20..], &fmla_sheet_names, &defined_names, &xtis) + .unwrap_or_else(|e| { + debug!("{}", e); + format!( + "Unrecognised formula \ for cell ({}, {}): {:?}", - row, col, e - ) - }); + row, col, e + ) + }); formulas.push(Cell::new(fmla_pos, fmla)); } - _ => (), + // tests/high_byte_string.xls contains a record type that + // cannot be found in the "By Number" 2.3.2 table + 0x00D6 => { + let Ok(s) = parse_label(r.data, &encoding, biff) else { + continue; + }; + cells.push(s); + } + _ => {} } } let range = Range::from_sparse(cells); @@ -771,36 +774,37 @@ fn parse_short_string( } let mut s = String::with_capacity(cch); - let _ = encoding.decode_to(r.data, cch, &mut s, high_byte); + encoding.decode_to(r.data, cch, &mut s, high_byte); Ok(s) } /// XLUnicodeString [MS-XLS 2.5.294] fn parse_string(r: &[u8], encoding: &XlsEncoding, biff: Biff) -> Result { - if r.len() < 4 { + let (mut high_byte, expected) = match biff { + Biff::Biff2 | Biff::Biff3 | Biff::Biff4 | Biff::Biff5 => (None, 2), + _ => (Some(false), 3), + }; + if r.len() < expected { + if 2 == r.len() && read_u16(r) == 0 { + // tests/high_byte_string.xls + return Ok(String::new()); + } return Err(XlsError::Len { typ: "string", - expected: 4, + expected, found: r.len(), }); } - let cch = read_u16(r) as usize; - - let (high_byte, start) = match biff { - Biff::Biff2 | Biff::Biff3 | Biff::Biff4 | Biff::Biff5 => (None, 2), - _ => (Some(r[2] & 0x1 != 0), 3), - }; + // delay populating Some(_) variant until length checks guarantee r[2] can't crash + high_byte = high_byte.map(|_| r[2] & 0x1 != 0); + let cch = read_u16(r) as usize; let mut s = String::with_capacity(cch); - let _ = encoding.decode_to(&r[start..], cch, &mut s, high_byte); + encoding.decode_to(&r[expected..], cch, &mut s, high_byte); Ok(s) } -fn parse_label( - r: &[u8], - encoding: &XlsEncoding, - biff: Biff, -) -> Result>, XlsError> { +fn parse_label(r: &[u8], encoding: &XlsEncoding, biff: Biff) -> Result, XlsError> { if r.len() < 6 { return Err(XlsError::Len { typ: "label", @@ -811,10 +815,10 @@ fn parse_label( let row = read_u16(r); let col = read_u16(&r[2..]); let _ixfe = read_u16(&r[4..]); - Ok(Some(Cell::new( + Ok(Cell::new( (row as u32, col as u32), Data::String(parse_string(&r[6..], encoding, biff)?), - ))) + )) } fn parse_label_sst(r: &[u8], strings: &[String]) -> Result>, XlsError> { @@ -840,7 +844,7 @@ fn parse_label_sst(r: &[u8], strings: &[String]) -> Result>, X } fn parse_dimensions(r: &[u8]) -> Result { - let (rf, rl, cf, cl) = match r.len() { + let (rf, rl, mut cf, cl) = match r.len() { 10 => ( read_u16(&r[0..2]) as u32, read_u16(&r[2..4]) as u32, @@ -861,6 +865,12 @@ fn parse_dimensions(r: &[u8]) -> Result { }); } }; + // 2.5.53 ColU must be <= 0xFF, if larger, reasonable to assume + // starts at 0 + // tests/OOM_alloc2.xls + if 0xFF < cf || cl < cf { + cf = 0; + } if 1 <= rl && 1 <= cl { Ok(Dimensions { start: (rf, cf), @@ -1005,8 +1015,10 @@ fn read_dbcs( Ok(s) } -fn read_unicode_string_no_cch(encoding: &XlsEncoding, buf: &[u8], len: &usize, s: &mut String) { - encoding.decode_to(&buf[1..=*len], *len, s, Some(buf[0] & 0x1 != 0)); +fn read_unicode_string_no_cch(buf: &[u8], len: &usize, s: &mut String) -> usize { + XlsEncoding::unicode() + .decode_to(&buf[1..], *len, s, Some(buf[0] & 0x1 != 0)) + .1 } struct Record<'a> { @@ -1147,7 +1159,6 @@ fn parse_formula( sheets: &[String], names: &[(String, String)], xtis: &[Xti], - encoding: &XlsEncoding, ) -> Result { let mut stack = Vec::new(); let mut formula = String::with_capacity(rgce.len()); @@ -1266,9 +1277,9 @@ fn parse_formula( stack.push(formula.len()); formula.push('\"'); let cch = rgce[0] as usize; - read_unicode_string_no_cch(encoding, &rgce[1..], &cch, &mut formula); + let l = read_unicode_string_no_cch(&rgce[1..], &cch, &mut formula); formula.push('\"'); - rgce = &rgce[2 + cch..]; + rgce = &rgce[2 + l..]; } 0x18 => { rgce = &rgce[5..]; diff --git a/tests/OOM_alloc.xls b/tests/OOM_alloc.xls new file mode 100644 index 0000000..860c2ac Binary files /dev/null and b/tests/OOM_alloc.xls differ diff --git a/tests/OOM_alloc2.xls b/tests/OOM_alloc2.xls new file mode 100644 index 0000000..b6e8d6b Binary files /dev/null and b/tests/OOM_alloc2.xls differ diff --git a/tests/OOM_alloc3.xls b/tests/OOM_alloc3.xls new file mode 100644 index 0000000..f5482ab Binary files /dev/null and b/tests/OOM_alloc3.xls differ diff --git a/tests/high_byte_string.xls b/tests/high_byte_string.xls new file mode 100644 index 0000000..4b4ba95 Binary files /dev/null and b/tests/high_byte_string.xls differ diff --git a/tests/test.rs b/tests/test.rs index 4faf6b7..97c7de0 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -714,7 +714,7 @@ fn date_xls() { #[cfg(feature = "dates")] { - let date = chrono::NaiveDate::from_ymd_opt(2021, 01, 01).unwrap(); + let date = chrono::NaiveDate::from_ymd_opt(2021, 1, 1).unwrap(); assert_eq!(range.get_value((0, 0)).unwrap().as_date(), Some(date)); let duration = chrono::Duration::seconds(255 * 60 * 60 + 10 * 60 + 10); @@ -749,7 +749,7 @@ fn date_xls_1904() { #[cfg(feature = "dates")] { - let date = chrono::NaiveDate::from_ymd_opt(2021, 01, 01).unwrap(); + let date = chrono::NaiveDate::from_ymd_opt(2021, 1, 1).unwrap(); assert_eq!(range.get_value((0, 0)).unwrap().as_date(), Some(date)); let duration = chrono::Duration::seconds(255 * 60 * 60 + 10 * 60 + 10); @@ -784,7 +784,7 @@ fn date_xlsx() { #[cfg(feature = "dates")] { - let date = chrono::NaiveDate::from_ymd_opt(2021, 01, 01).unwrap(); + let date = chrono::NaiveDate::from_ymd_opt(2021, 1, 1).unwrap(); assert_eq!(range.get_value((0, 0)).unwrap().as_date(), Some(date)); let duration = chrono::Duration::seconds(255 * 60 * 60 + 10 * 60 + 10); @@ -819,7 +819,7 @@ fn date_xlsx_1904() { #[cfg(feature = "dates")] { - let date = chrono::NaiveDate::from_ymd_opt(2021, 01, 01).unwrap(); + let date = chrono::NaiveDate::from_ymd_opt(2021, 1, 1).unwrap(); assert_eq!(range.get_value((0, 0)).unwrap().as_date(), Some(date)); let duration = chrono::Duration::seconds(255 * 60 * 60 + 10 * 60 + 10); @@ -850,7 +850,7 @@ fn date_xlsx_iso() { #[cfg(feature = "dates")] { - let date = chrono::NaiveDate::from_ymd_opt(2021, 01, 01).unwrap(); + let date = chrono::NaiveDate::from_ymd_opt(2021, 1, 1).unwrap(); assert_eq!(range.get_value((0, 0)).unwrap().as_date(), Some(date)); assert_eq!(range.get_value((0, 0)).unwrap().as_time(), None); assert_eq!(range.get_value((0, 0)).unwrap().as_datetime(), None); @@ -894,7 +894,7 @@ fn date_ods() { #[cfg(feature = "dates")] { - let date = chrono::NaiveDate::from_ymd_opt(2021, 01, 01).unwrap(); + let date = chrono::NaiveDate::from_ymd_opt(2021, 1, 1).unwrap(); assert_eq!(range.get_value((0, 0)).unwrap().as_date(), Some(date)); let time = chrono::NaiveTime::from_hms_opt(10, 10, 10).unwrap(); @@ -942,7 +942,7 @@ fn date_xlsb() { #[cfg(feature = "dates")] { - let date = chrono::NaiveDate::from_ymd_opt(2021, 01, 01).unwrap(); + let date = chrono::NaiveDate::from_ymd_opt(2021, 1, 1).unwrap(); assert_eq!(range.get_value((0, 0)).unwrap().as_date(), Some(date)); let duration = chrono::Duration::seconds(255 * 60 * 60 + 10 * 60 + 10); @@ -977,7 +977,7 @@ fn date_xlsb_1904() { #[cfg(feature = "dates")] { - let date = chrono::NaiveDate::from_ymd_opt(2021, 01, 01).unwrap(); + let date = chrono::NaiveDate::from_ymd_opt(2021, 1, 1).unwrap(); assert_eq!(range.get_value((0, 0)).unwrap().as_date(), Some(date)); let duration = chrono::Duration::seconds(255 * 60 * 60 + 10 * 60 + 10); @@ -2138,3 +2138,34 @@ fn test_string_ref() { // second sheet is the same with a cell reference to the first sheet range_eq!(xlsx.worksheet_range_at(1).unwrap().unwrap(), expected_range); } + +#[test] +fn test_high_byte_strings_and_unicode_strings_without_reserved_tags() { + // file contains XLUnicodeString with cch = 0 and do not have a reserved byte tag + // as well as record types that do not seem to be present in the spec + let mut xls: Xls<_> = wb("high_byte_string.xls"); + for (_name, ws) in xls.worksheets() { + for (row, _col, cell) in ws.used_cells() { + if row == 3 { + assert_eq!( + cell.as_string().unwrap(), + "Inside FERC's Gas Market Report monthly bidweek price file. " + ); + } + } + } + // FIXME: Libreoffice recognizes a REPT("O", I44) formula + let formulas = xls.worksheet_formula("Sheet1").unwrap(); + assert_eq!( + "Unrecognised formula for cell (43, 9): Unrecognized { typ: \"ptg\", val: 192 }", + formulas.get_value((43, 9)).unwrap() + ); +} + +#[test] +fn test_oom_allocation() { + let _xls: Xls<_> = wb("OOM_alloc.xls"); + let _xls: Xls<_> = wb("OOM_alloc2.xls"); + // FIXME: kills all tests with abort unless unstable set_alloc_error_hook is used + // let _xls: Xls<_> = wb("OOM_alloc3.xls"); +}