From 0c87903d35f03d05f6dfa833afc6cde228b00885 Mon Sep 17 00:00:00 2001 From: Johann Tuffe Date: Wed, 1 Nov 2023 22:19:56 +0800 Subject: [PATCH] add formula reading for xlsb cells_reader --- src/auto.rs | 10 +- src/lib.rs | 3 +- src/ods.rs | 7 +- src/xls.rs | 7 +- src/xlsb/cells_reader.rs | 54 ++++++++- src/xlsb/mod.rs | 250 ++++----------------------------------- src/xlsx/mod.rs | 19 +-- tests/test.rs | 18 +-- 8 files changed, 110 insertions(+), 258 deletions(-) diff --git a/src/auto.rs b/src/auto.rs index f886a4d1..61f14d60 100644 --- a/src/auto.rs +++ b/src/auto.rs @@ -115,12 +115,12 @@ where } /// Read worksheet formula in corresponding worksheet path - fn worksheet_formula(&mut self, name: &str) -> Option, Self::Error>> { + fn worksheet_formula(&mut self, name: &str) -> Result, Self::Error> { match *self { - Sheets::Xls(ref mut e) => e.worksheet_formula(name).map(|r| r.map_err(Error::Xls)), - Sheets::Xlsx(ref mut e) => e.worksheet_formula(name).map(|r| r.map_err(Error::Xlsx)), - Sheets::Xlsb(ref mut e) => e.worksheet_formula(name).map(|r| r.map_err(Error::Xlsb)), - Sheets::Ods(ref mut e) => e.worksheet_formula(name).map(|r| r.map_err(Error::Ods)), + Sheets::Xls(ref mut e) => e.worksheet_formula(name).map_err(Error::Xls), + Sheets::Xlsx(ref mut e) => e.worksheet_formula(name).map_err(Error::Xlsx), + Sheets::Xlsb(ref mut e) => e.worksheet_formula(name).map_err(Error::Xlsb), + Sheets::Ods(ref mut e) => e.worksheet_formula(name).map_err(Error::Ods), } } diff --git a/src/lib.rs b/src/lib.rs index dbd0d79b..6ab55fcf 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -49,7 +49,6 @@ //! println!("found {} formula in '{}'", //! workbook //! .worksheet_formula(&s) -//! .expect("sheet not found") //! .expect("error while getting formula") //! .rows().flat_map(|r| r.iter().filter(|f| !f.is_empty())) //! .count(), @@ -230,7 +229,7 @@ where fn worksheets(&mut self) -> Vec<(String, Range)>; /// Read worksheet formula in corresponding worksheet path - fn worksheet_formula(&mut self, _: &str) -> Option, Self::Error>>; + fn worksheet_formula(&mut self, _: &str) -> Result, Self::Error>; /// Get all sheet names of this workbook, in workbook order /// diff --git a/src/ods.rs b/src/ods.rs index 5be473be..cefe9626 100644 --- a/src/ods.rs +++ b/src/ods.rs @@ -185,8 +185,11 @@ where } /// Read worksheet data in corresponding worksheet path - fn worksheet_formula(&mut self, name: &str) -> Option, OdsError>> { - self.sheets.get(name).map(|r| Ok(r.1.to_owned())) + fn worksheet_formula(&mut self, name: &str) -> Result, OdsError> { + self.sheets + .get(name) + .ok_or_else(|| OdsError::WorksheetNotFound(name.into())) + .map(|r| r.1.to_owned()) } #[cfg(feature = "picture")] diff --git a/src/xls.rs b/src/xls.rs index 6f8e599a..34c619eb 100644 --- a/src/xls.rs +++ b/src/xls.rs @@ -236,8 +236,11 @@ impl Reader for Xls { .collect() } - fn worksheet_formula(&mut self, name: &str) -> Option, XlsError>> { - self.sheets.get(name).map(|r| Ok(r.1.clone())) + fn worksheet_formula(&mut self, name: &str) -> Result, XlsError> { + self.sheets + .get(name) + .ok_or_else(|| XlsError::WorksheetNotFound(name.into())) + .map(|r| r.1.clone()) } #[cfg(feature = "picture")] diff --git a/src/xlsb/cells_reader.rs b/src/xlsb/cells_reader.rs index 2961f9a9..474a0738 100644 --- a/src/xlsb/cells_reader.rs +++ b/src/xlsb/cells_reader.rs @@ -5,13 +5,15 @@ use crate::{ Cell, CellErrorType, Dimensions, XlsbError, }; -use super::{cell_format, wide_str, RecordIter}; +use super::{cell_format, parse_formula, wide_str, RecordIter}; /// A cells reader for xlsb files pub struct XlsbCellsReader<'a> { iter: RecordIter<'a>, formats: &'a [CellFormat], strings: &'a [String], + extern_sheets: &'a [String], + metadata_names: &'a [(String, String)], typ: u16, row: u32, is_1904: bool, @@ -24,6 +26,8 @@ impl<'a> XlsbCellsReader<'a> { mut iter: RecordIter<'a>, formats: &'a [CellFormat], strings: &'a [String], + extern_sheets: &'a [String], + metadata_names: &'a [(String, String)], is_1904: bool, ) -> Result { let mut buf = Vec::with_capacity(1024); @@ -55,6 +59,8 @@ impl<'a> XlsbCellsReader<'a> { formats, is_1904, strings, + extern_sheets, + metadata_names, dimensions, typ: 0, row: 0, @@ -144,6 +150,52 @@ impl<'a> XlsbCellsReader<'a> { let col = read_u32(&self.buf); Ok(Some(Cell::new((self.row, col), value))) } + + pub fn next_formula(&mut self) -> Result>, XlsbError> { + let value = loop { + self.typ = self.iter.read_type()?; + let _ = self.iter.fill_buffer(&mut self.buf)?; + + let value = match self.typ { + // 0x0001 => continue, // DataType::Empty, // BrtCellBlank + 0x0008 => { + // BrtFmlaString + let cch = read_u32(&self.buf[8..]) as usize; + let formula = &self.buf[14 + cch * 2..]; + let cce = read_u32(formula) as usize; + let rgce = &formula[4..4 + cce]; + parse_formula(rgce, &self.extern_sheets, &self.metadata_names)? + } + 0x0009 => { + // BrtFmlaNum + let formula = &self.buf[18..]; + let cce = read_u32(formula) as usize; + let rgce = &formula[4..4 + cce]; + parse_formula(rgce, &self.extern_sheets, &self.metadata_names)? + } + 0x000A | 0x000B => { + // BrtFmlaBool | BrtFmlaError + let formula = &self.buf[11..]; + let cce = read_u32(formula) as usize; + let rgce = &formula[4..4 + cce]; + parse_formula(rgce, &self.extern_sheets, &self.metadata_names)? + } + 0x0000 => { + // BrtRowHdr + self.row = read_u32(&self.buf); + if self.row > 0x0010_0000 { + return Ok(None); // invalid row + } + continue; + } + 0x0092 => return Ok(None), // BrtEndSheetData + _ => continue, // anything else, ignore and try next, without changing idx + }; + break value; + }; + let col = read_u32(&self.buf); + Ok(Some(Cell::new((self.row, col), value))) + } } fn parse_dimensions(buf: &[u8]) -> Dimensions { diff --git a/src/xlsb/mod.rs b/src/xlsb/mod.rs index 3412f612..c94a1079 100644 --- a/src/xlsb/mod.rs +++ b/src/xlsb/mod.rs @@ -18,14 +18,10 @@ use zip::read::{ZipArchive, ZipFile}; use zip::result::ZipError; use crate::datatype::DataTypeRef; -use crate::formats::{ - builtin_format_by_code, detect_custom_number_format, format_excel_f64, CellFormat, -}; +use crate::formats::{builtin_format_by_code, detect_custom_number_format, CellFormat}; use crate::utils::{push_column, read_f64, read_i32, read_u16, read_u32, read_usize}; use crate::vba::VbaProject; -use crate::{ - Cell, CellErrorType, DataType, Metadata, Range, Reader, Sheet, SheetType, SheetVisible, -}; +use crate::{Cell, DataType, Metadata, Range, Reader, Sheet, SheetType, SheetVisible}; /// A Xlsb specific error #[derive(Debug)] @@ -396,208 +392,14 @@ impl Xlsb { None => return Err(XlsbError::WorksheetNotFound(name.into())), }; let iter = RecordIter::from_zip(&mut self.zip, &path)?; - XlsbCellsReader::new(iter, &self.formats, &self.strings, self.is_1904) - } - - fn worksheet_range_from_path(&mut self, path: &str) -> Result, XlsbError> { - let mut iter = RecordIter::from_zip(&mut self.zip, &path)?; - let mut buf = Vec::with_capacity(1024); - let formats = &self.formats; - // BrtWsDim - let _ = iter.next_skip_blocks( - 0x0094, - &[ - (0x0081, None), // BrtBeginSheet - (0x0093, None), // BrtWsProp - ], - &mut buf, - )?; - let (start, end) = parse_dimensions(&buf[..16]); - let len = (end.0 - start.0 + 1) * (end.1 - start.1 + 1); - let mut cells = if len < 1_000_000 { - Vec::with_capacity(len as usize) - } else { - Vec::new() - }; - - // BrtBeginSheetData - let _ = iter.next_skip_blocks( - 0x0091, - &[ - (0x0085, Some(0x0086)), // Views - (0x0025, Some(0x0026)), // AC blocks - (0x01E5, None), // BrtWsFmtInfo - (0x0186, Some(0x0187)), // Col Infos - ], - &mut buf, - )?; - - // Initialization: first BrtRowHdr - let mut typ: u16; - let mut row = 0u32; - - // loop until end of sheet - loop { - typ = iter.read_type()?; - let _ = iter.fill_buffer(&mut buf)?; - - let value = match typ { - // 0x0001 => continue, // DataType::Empty, // BrtCellBlank - 0x0002 => { - // BrtCellRk MS-XLSB 2.5.122 - let d100 = (buf[8] & 1) != 0; - let is_int = (buf[8] & 2) != 0; - buf[8] &= 0xFC; - - if is_int { - let v = (read_i32(&buf[8..12]) >> 2) as i64; - if d100 { - let v = (v as f64) / 100.0; - format_excel_f64(v, cell_format(formats, &buf), self.is_1904) - } else { - DataType::Int(v) - } - } else { - let mut v = [0u8; 8]; - v[4..].copy_from_slice(&buf[8..12]); - let v = read_f64(&v); - let v = if d100 { v / 100.0 } else { v }; - format_excel_f64(v, cell_format(formats, &buf), self.is_1904) - } - } - 0x0003 => { - let error = match buf[8] { - 0x00 => CellErrorType::Null, - 0x07 => CellErrorType::Div0, - 0x0F => CellErrorType::Value, - 0x17 => CellErrorType::Ref, - 0x1D => CellErrorType::Name, - 0x24 => CellErrorType::Num, - 0x2A => CellErrorType::NA, - 0x2B => CellErrorType::GettingData, - c => return Err(XlsbError::CellError(c)), - }; - // BrtCellError - DataType::Error(error) - } - 0x0004 | 0x000A => DataType::Bool(buf[8] != 0), // BrtCellBool or BrtFmlaBool - 0x0005 | 0x0009 => { - let v = read_f64(&buf[8..16]); - format_excel_f64(v, cell_format(formats, &buf), self.is_1904) - } // BrtCellReal or BrtFmlaNum - 0x0006 | 0x0008 => DataType::String(wide_str(&buf[8..], &mut 0)?.into_owned()), // BrtCellSt or BrtFmlaString - 0x0007 => { - // BrtCellIsst - let isst = read_usize(&buf[8..12]); - DataType::String(self.strings[isst].clone()) - } - 0x0000 => { - // BrtRowHdr - row = read_u32(&buf); - if row > 0x0010_0000 { - return Ok(Range::from_sparse(cells)); // invalid row - } - continue; - } - 0x0092 => return Ok(Range::from_sparse(cells)), // BrtEndSheetData - _ => continue, // anything else, ignore and try next, without changing idx - }; - - let col = read_u32(&buf); - match value { - DataType::Empty => (), - DataType::String(s) if s.is_empty() => (), - value => cells.push(Cell::new((row, col), value)), - } - } - } - - fn worksheet_formula_from_path(&mut self, path: String) -> Result, XlsbError> { - let mut iter = RecordIter::from_zip(&mut self.zip, &path)?; - let mut buf = Vec::with_capacity(1024); - - // BrtWsDim - let _ = iter.next_skip_blocks( - 0x0094, - &[ - (0x0081, None), // BrtBeginSheet - (0x0093, None), // BrtWsProp - ], - &mut buf, - )?; - let (start, end) = parse_dimensions(&buf[..16]); - let mut cells = Vec::new(); - if start.0 <= end.0 && start.1 <= end.1 { - let rows = (end.0 - start.0 + 1) as usize; - let cols = (end.1 - start.1 + 1) as usize; - let len = rows.saturating_mul(cols); - if len < 1_000_000 { - cells.reserve(len); - } - } - - // BrtBeginSheetData - let _ = iter.next_skip_blocks( - 0x0091, - &[ - (0x0085, Some(0x0086)), // Views - (0x0025, Some(0x0026)), // AC blocks - (0x01E5, None), // BrtWsFmtInfo - (0x0186, Some(0x0187)), // Col Infos - ], - &mut buf, - )?; - - // Initialization: first BrtRowHdr - let mut typ: u16; - let mut row = 0u32; - - // loop until end of sheet - loop { - typ = iter.read_type()?; - let _ = iter.fill_buffer(&mut buf)?; - - let value = match typ { - // 0x0001 => continue, // DataType::Empty, // BrtCellBlank - 0x0008 => { - // BrtFmlaString - let cch = read_u32(&buf[8..]) as usize; - let formula = &buf[14 + cch * 2..]; - let cce = read_u32(formula) as usize; - let rgce = &formula[4..4 + cce]; - parse_formula(rgce, &self.extern_sheets, &self.metadata.names)? - } - 0x0009 => { - // BrtFmlaNum - let formula = &buf[18..]; - let cce = read_u32(formula) as usize; - let rgce = &formula[4..4 + cce]; - parse_formula(rgce, &self.extern_sheets, &self.metadata.names)? - } - 0x000A | 0x000B => { - // BrtFmlaBool | BrtFmlaError - let formula = &buf[11..]; - let cce = read_u32(formula) as usize; - let rgce = &formula[4..4 + cce]; - parse_formula(rgce, &self.extern_sheets, &self.metadata.names)? - } - 0x0000 => { - // BrtRowHdr - row = read_u32(&buf); - if row > 0x0010_0000 { - return Ok(Range::from_sparse(cells)); // invalid row - } - continue; - } - 0x0092 => return Ok(Range::from_sparse(cells)), // BrtEndSheetData - _ => continue, // anything else, ignore and try next, without changing idx - }; - - let col = read_u32(&buf); - if !value.is_empty() { - cells.push(Cell::new((row, col), value)); - } - } + XlsbCellsReader::new( + iter, + &self.formats, + &self.strings, + &self.extern_sheets, + &self.metadata.names, + self.is_1904, + ) } #[cfg(feature = "picture")] @@ -680,21 +482,28 @@ impl Reader for Xlsb { } /// MS-XLSB 2.1.7.62 - fn worksheet_formula(&mut self, name: &str) -> Option, XlsbError>> { - let path = match self.sheets.iter().find(|&(n, _)| n == name) { - Some((_, path)) => path.clone(), - None => return None, - }; - Some(self.worksheet_formula_from_path(path)) + fn worksheet_formula(&mut self, name: &str) -> Result, XlsbError> { + let mut cells_reader = self.worksheet_cells_reader(name)?; + let mut cells = Vec::with_capacity(cells_reader.dimensions().len().min(1_000_000) as _); + while let Some(cell) = cells_reader.next_formula()? { + if !cell.val.is_empty() { + cells.push(cell); + } + } + Ok(Range::from_sparse(cells)) } /// MS-XLSB 2.1.7.62 fn worksheets(&mut self) -> Vec<(String, Range)> { - let sheets = self.sheets.clone(); + let sheets = self + .sheets + .iter() + .map(|(name, _)| name.clone()) + .collect::>(); sheets .into_iter() - .filter_map(|(name, path)| { - let ws = self.worksheet_range_from_path(&path).ok()?; + .filter_map(|name| { + let ws = self.worksheet_range(&name).ok()?; Some((name, ws)) }) .collect() @@ -796,13 +605,6 @@ fn wide_str<'a>(buf: &'a [u8], str_len: &mut usize) -> Result, Xlsb Ok(UTF_16LE.decode(s).0) } -fn parse_dimensions(buf: &[u8]) -> ((u32, u32), (u32, u32)) { - ( - (read_u32(&buf[0..4]), read_u32(&buf[8..12])), - (read_u32(&buf[4..8]), read_u32(&buf[12..16])), - ) -} - /// Formula parsing /// /// [MS-XLSB 2.2.2] diff --git a/src/xlsx/mod.rs b/src/xlsx/mod.rs index a5fe212d..b193beee 100644 --- a/src/xlsx/mod.rs +++ b/src/xlsx/mod.rs @@ -810,26 +810,19 @@ impl Reader for Xlsx { }) } - fn worksheet_formula(&mut self, name: &str) -> Option, XlsxError>> { - let mut cell_reader = match self.worksheet_cells_reader(name) { - Ok(reader) => reader, - Err(XlsxError::WorksheetNotFound(_)) => return None, - Err(e) => return Some(Err(e)), - }; + fn worksheet_formula(&mut self, name: &str) -> Result, XlsxError> { + let mut cell_reader = self.worksheet_cells_reader(name)?; let len = cell_reader.dimensions().len(); let mut cells = Vec::new(); if len < 100_000 { cells.reserve(len as usize); } - loop { - match cell_reader.next_formula() { - // Ok(Some(cell)) if cell.get_value().is_empty() => (), - Ok(Some(cell)) => cells.push(cell), - Ok(None) => break, - Err(e) => return Some(Err(e)), + while let Some(cell) = cell_reader.next_formula()? { + if !cell.val.is_empty() { + cells.push(cell); } } - Some(Ok(Range::from_sparse(cells))) + Ok(Range::from_sparse(cells)) } fn worksheets(&mut self) -> Vec<(String, Range)> { diff --git a/tests/test.rs b/tests/test.rs index 7c83b15c..7c6f6b68 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -524,10 +524,10 @@ fn formula_xlsx() { let sheets = excel.sheet_names().to_owned(); for s in sheets { - let _ = excel.worksheet_formula(&s).unwrap().unwrap(); + let _ = excel.worksheet_formula(&s).unwrap(); } - let formula = excel.worksheet_formula("Sheet1").unwrap().unwrap(); + let formula = excel.worksheet_formula("Sheet1").unwrap(); range_eq!(formula, [["B1+OneRange".to_string()]]); } @@ -540,10 +540,10 @@ fn formula_xlsb() { let sheets = excel.sheet_names().to_owned(); for s in sheets { - let _ = excel.worksheet_formula(&s).unwrap().unwrap(); + let _ = excel.worksheet_formula(&s).unwrap(); } - let formula = excel.worksheet_formula("Sheet1").unwrap().unwrap(); + let formula = excel.worksheet_formula("Sheet1").unwrap(); range_eq!(formula, [["B1+OneRange".to_string()]]); } @@ -590,10 +590,10 @@ fn formula_xls() { let sheets = excel.sheet_names().to_owned(); for s in sheets { - let _ = excel.worksheet_formula(&s).unwrap().unwrap(); + let _ = excel.worksheet_formula(&s).unwrap(); } - let formula = excel.worksheet_formula("Sheet1").unwrap().unwrap(); + let formula = excel.worksheet_formula("Sheet1").unwrap(); range_eq!(formula, [["B1+OneRange".to_string()]]); } @@ -605,10 +605,10 @@ fn formula_ods() { let mut excel: Ods<_> = open_workbook(&path).unwrap(); for s in excel.sheet_names().to_owned() { - let _ = excel.worksheet_formula(&s).unwrap().unwrap(); + let _ = excel.worksheet_formula(&s).unwrap(); } - let formula = excel.worksheet_formula("Sheet1").unwrap().unwrap(); + let formula = excel.worksheet_formula("Sheet1").unwrap(); range_eq!(formula, [["of:=[.B1]+$$OneRange".to_string()]]); } @@ -1285,7 +1285,7 @@ fn issue304_xls_formula() { setup(); let path = format!("{}/tests/xls_formula.xls", env!("CARGO_MANIFEST_DIR")); let mut wb: Xls<_> = open_workbook(&path).unwrap(); - let formula = wb.worksheet_formula("Sheet1").unwrap().unwrap(); + let formula = wb.worksheet_formula("Sheet1").unwrap(); let mut rows = formula.rows(); assert_eq!(rows.next(), Some(&["A1*2".to_owned()][..])); assert_eq!(rows.next(), Some(&["2*Sheet2!A1".to_owned()][..]));