From 8e6e70651028d722544d2599adbf6801b419c1a6 Mon Sep 17 00:00:00 2001 From: Vladimir Stolyarov Date: Fri, 6 Aug 2021 18:44:34 +0300 Subject: [PATCH] Improve parsing speed by manual implementing xml.Unmarshaller for some types. (#34) * Improve parsing speed by manual implementing xml.Unmarshaller for some types. Implementation made for types representing row and shared string. I've got around 25% speed improvement. Approaches taken from https://stackoverflow.com/a/61858457 * Fix up error messages, make private stuff, etc * Move file to remove 'util' from name - too generic! Co-authored-by: Douglas Parsons --- file.go | 20 +++-- file_test.go | 15 ++-- go.mod | 2 + rows.go | 164 ++++++++++++++++++++++++++++++++++++++--- rows_test.go | 12 +-- shared_strings.go | 77 ++++++++++++++++++- shared_strings_test.go | 2 +- sheets.go | 6 +- styles.go | 2 +- xml.go | 17 +++++ 10 files changed, 274 insertions(+), 43 deletions(-) create mode 100644 xml.go diff --git a/file.go b/file.go index ff8566c..98325ed 100644 --- a/file.go +++ b/file.go @@ -11,7 +11,6 @@ import ( type XlsxFile struct { Sheets []string - zipReader *zip.Reader sheetFiles map[string]*zip.File sharedStrings []string dateStyles map[int]bool @@ -32,7 +31,7 @@ func getFileForName(files []*zip.File, name string) (*zip.File, error) { } } - return nil, fmt.Errorf("File not found: %s", name) + return nil, fmt.Errorf("file not found: %s", name) } // readFile opens and reads the entire contents of a *zip.File into memory. @@ -70,15 +69,14 @@ func OpenFile(filename string) (*XlsxFileCloser, error) { return nil, err } - x := new(XlsxFile) - + x := XlsxFile{} if err := x.init(&zipFile.Reader); err != nil { zipFile.Close() return nil, err } return &XlsxFileCloser{ - XlsxFile: *x, + XlsxFile: x, zipReadCloser: zipFile, }, nil } @@ -88,7 +86,7 @@ func OpenFile(filename string) (*XlsxFileCloser, error) { // is returned. // Note that the file must be Close()-d when you are finished with it. func OpenReaderZip(rc *zip.ReadCloser) (*XlsxFileCloser, error) { - x := new(XlsxFile) + x := XlsxFile{} if err := x.init(&rc.Reader); err != nil { rc.Close() @@ -96,7 +94,7 @@ func OpenReaderZip(rc *zip.ReadCloser) (*XlsxFileCloser, error) { } return &XlsxFileCloser{ - XlsxFile: *x, + XlsxFile: x, zipReadCloser: rc, }, nil } @@ -110,26 +108,26 @@ func NewReader(xlsxBytes []byte) (*XlsxFile, error) { return nil, err } - x := new(XlsxFile) + x := XlsxFile{} err = x.init(r) if err != nil { return nil, err } - return x, nil + return &x, nil } // NewReaderZip takes zip reader of Xlsx file and returns a populated XlsxFile struct for it. // If the file cannot be found, or key parts of the files contents are missing, an error // is returned. func NewReaderZip(r *zip.Reader) (*XlsxFile, error) { - x := new(XlsxFile) + x := XlsxFile{} if err := x.init(r); err != nil { return nil, err } - return x, nil + return &x, nil } func (x *XlsxFile) init(zipReader *zip.Reader) error { diff --git a/file_test.go b/file_test.go index 771312b..7ac2a0f 100644 --- a/file_test.go +++ b/file_test.go @@ -28,28 +28,26 @@ func TestGettingFileByNameFailure(t *testing.T) { _, err := getFileForName(zipFiles, "OOPS") - require.EqualError(t, err, "File not found: OOPS") + require.EqualError(t, err, "file not found: OOPS") } func TestOpeningMissingFile(t *testing.T) { - f, err := OpenFile("this_doesnt_exist.zip") - defer f.Close() - + _, err := OpenFile("this_doesnt_exist.zip") require.EqualError(t, err, "open this_doesnt_exist.zip: no such file or directory") } func TestHandlingSpuriousWorkbookLinks(t *testing.T) { f, err := OpenFile("./test/test-xl-relationship-prefix.xlsx") - defer f.Close() require.NoError(t, err) + defer f.Close() } func TestOpeningXlsxFile(t *testing.T) { f, err := OpenFile("./test/test-small.xlsx") + require.NoError(t, err) defer f.Close() - require.NoError(t, err) require.Equal(t, []string{"datarefinery_groundtruth_400000"}, f.Sheets) } @@ -59,9 +57,9 @@ func TestOpeningZipReadCloser(t *testing.T) { defer zrc.Close() f, err := OpenReaderZip(zrc) + require.NoError(t, err) defer f.Close() - require.NoError(t, err) require.Equal(t, []string{"datarefinery_groundtruth_400000"}, f.Sheets) } @@ -73,7 +71,8 @@ func TestClosingFile(t *testing.T) { } func TestNewReaderFromXlsxBytes(t *testing.T) { - f, _ := os.Open("./test/test-small.xlsx") + f, err := os.Open("./test/test-small.xlsx") + require.NoError(t, err) defer f.Close() b, _ := ioutil.ReadAll(f) diff --git a/go.mod b/go.mod index e1c08ab..abd2786 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,5 @@ module github.com/thedatashed/xlsxreader +go 1.15 + require github.com/stretchr/testify v1.3.0 diff --git a/rows.go b/rows.go index a4c3368..f7a3ad7 100644 --- a/rows.go +++ b/rows.go @@ -13,6 +13,51 @@ type rawRow struct { RawCells []rawCell `xml:"c"` } +func (rr *rawRow) unmarshalXML(d *xml.Decoder, start xml.StartElement) error { + for _, attr := range start.Attr { + if attr.Name.Local != "r" { + continue + } + + var err error + + if rr.Index, err = strconv.Atoi(attr.Value); err != nil { + return err + } + } + + for { + tok, err := d.Token() + if err != nil { + return err + } + + var se xml.StartElement + + switch el := tok.(type) { + case xml.StartElement: + se = el + case xml.EndElement: + if el == start.End() { + return nil + } + default: + continue + } + + if se.Name.Local != "c" { + continue + } + + var rc rawCell + if err = rc.unmarshalXML(d, se); err != nil { + return err + } + + rr.RawCells = append(rr.RawCells, rc) + } +} + // rawCell represents the raw XML element for parsing a cell. type rawCell struct { Reference string `xml:"r,attr"` // E.g. A1 @@ -22,6 +67,99 @@ type rawCell struct { InlineString *string `xml:"is>t"` } +func (rc *rawCell) unmarshalXML(d *xml.Decoder, start xml.StartElement) error { + // unmarshal attributes + for _, attr := range start.Attr { + switch attr.Name.Local { + case "r": + rc.Reference = attr.Value + case "t": + rc.Type = attr.Value + case "s": + var err error + + if rc.Style, err = strconv.Atoi(attr.Value); err != nil { + return err + } + } + } + + for { + tok, err := d.Token() + if err != nil { + return err + } + + var se xml.StartElement + + switch el := tok.(type) { + case xml.StartElement: + se = el + case xml.EndElement: + if el == start.End() { + return nil + } + continue + default: + continue + } + + switch se.Name.Local { + case "is": + err = rc.unmarshalInlineString(d, se) + case "v": + var v string + + if v, err = getCharData(d); err != nil { + return err + } + + rc.Value = &v + default: + continue + } + + if err != nil { + return err + } + } +} + +func (rc *rawCell) unmarshalInlineString(d *xml.Decoder, start xml.StartElement) error { + for { + tok, err := d.Token() + if err != nil { + return err + } + + var se xml.StartElement + + switch el := tok.(type) { + case xml.StartElement: + se = el + case xml.EndElement: + if el == start.End() { + return nil + } + continue + default: + continue + } + + if se.Name.Local != "t" { + continue + } + + v, err := getCharData(d) + if err != nil { + return err + } + + rc.InlineString = &v + return nil + } +} + // Row represents a row of data read from an Xlsx file, in a consumable format type Row struct { Error error @@ -37,15 +175,21 @@ type Cell struct { Type CellType } +// CellType defines the data type of an excel cell type CellType string const ( - TypeString CellType = "string" + // TypeString is for text cells + TypeString CellType = "string" + // TypeNumerical is for numerical values TypeNumerical CellType = "numerical" - TypeDateTime CellType = "datetime" - TypeBoolean CellType = "boolean" + // TypeDateTime is for date values + TypeDateTime CellType = "datetime" + // TypeBoolean is for true/false values + TypeBoolean CellType = "boolean" ) +// ColumnIndex gives a number, representing the column the cell lies beneath. func (c Cell) ColumnIndex() int { return asIndex(c.Column) } @@ -57,13 +201,13 @@ func (c Cell) ColumnIndex() int { func (x *XlsxFile) getCellValue(r rawCell) (string, error) { if r.Type == "inlineStr" { if r.InlineString == nil { - return "", fmt.Errorf("Cell had type of InlineString, but the InlineString attribute was missing") + return "", fmt.Errorf("cell had type of InlineString, but the InlineString attribute was missing") } return *r.InlineString, nil } if r.Value == nil { - return "", fmt.Errorf("Unable to get cell value for cell %s - no value element found", r.Reference) + return "", fmt.Errorf("unable to get cell value for cell %s - no value element found", r.Reference) } if r.Type == "s" { @@ -72,7 +216,7 @@ func (x *XlsxFile) getCellValue(r rawCell) (string, error) { return "", err } if len(x.sharedStrings) <= index { - return "", fmt.Errorf("Attempted to index value %d in shared strings of length %d", + return "", fmt.Errorf("attempted to index value %d in shared strings of length %d", index, len(x.sharedStrings)) } @@ -118,7 +262,7 @@ func (x *XlsxFile) readSheetRows(sheet string, ch chan<- Row) { file, ok := x.sheetFiles[sheet] if !ok { ch <- Row{ - Error: fmt.Errorf("Unable to open sheet %s", sheet), + Error: fmt.Errorf("unable to open sheet %s", sheet), } return } @@ -157,8 +301,8 @@ func (x *XlsxFile) readSheetRows(sheet string, ch chan<- Row) { // The Row struct returned will contain any errors that occurred either in // interrogating values, or in parsing the XML. func (x *XlsxFile) parseRow(decoder *xml.Decoder, startElement *xml.StartElement) Row { - r := rawRow{} - err := decoder.DecodeElement(&r, startElement) + var r rawRow + err := r.unmarshalXML(decoder, *startElement) if err != nil { return Row{ Error: err, @@ -242,7 +386,7 @@ func removeNonAlpha(r rune) rune { // cell name to cell index. 'A' -> 0, 'Z' -> 25, 'AA' -> 26 func asIndex(s string) int { index := 0 - for _, c := range []rune(s) { + for _, c := range s { index *= 26 index += int(c) - 'A' + 1 } diff --git a/rows_test.go b/rows_test.go index cbf9223..b4017cd 100644 --- a/rows_test.go +++ b/rows_test.go @@ -35,7 +35,7 @@ var cellValueTests = []struct { { Name: "Invalid Inline String", Cell: rawCell{Type: "inlineStr", InlineString: nil}, - Error: "Cell had type of InlineString, but the InlineString attribute was missing", + Error: "cell had type of InlineString, but the InlineString attribute was missing", }, { Name: "Valid Date", @@ -70,7 +70,7 @@ var cellValueTests = []struct { { Name: "Invalid (invalid offset) Shared String", Cell: rawCell{Type: "s", Value: &offsetTooHighSharedString}, - Error: "Attempted to index value 32 in shared strings of length 4", + Error: "attempted to index value 32 in shared strings of length 4", }, { Name: "Unknown type", @@ -90,7 +90,7 @@ var cellValueTests = []struct { { Name: "No Inline String or Value", Cell: rawCell{Type: "s", Reference: "C23"}, - Error: "Unable to get cell value for cell C23 - no value element found", + Error: "unable to get cell value for cell C23 - no value element found", }, } @@ -169,8 +169,8 @@ var readSheetRowsTests = []struct { SheetName string Error string }{ - {"worksheetOne", "Unable to open sheet worksheetOne"}, - {"NonExistent", "Unable to open sheet NonExistent"}, + {"worksheetOne", "unable to open sheet worksheetOne"}, + {"NonExistent", "unable to open sheet NonExistent"}, } func TestReadSheetRows(t *testing.T) { @@ -231,7 +231,7 @@ var parseRawCellsTests = []struct { }{ { Name: "Invalid Cell", - Error: "Cell had type of InlineString, but the InlineString attribute was missing", + Error: "cell had type of InlineString, but the InlineString attribute was missing", RawCells: []rawCell{ {Type: "inlineStr", InlineString: nil, Value: &inlineStr}, }, diff --git a/shared_strings.go b/shared_strings.go index cf4d121..05822a7 100644 --- a/shared_strings.go +++ b/shared_strings.go @@ -15,6 +15,77 @@ type sharedStringsValue struct { RichText []string `xml:"r>t"` } +func (sv *sharedStringsValue) unmarshalXML(d *xml.Decoder, start xml.StartElement) error { + for { + tok, err := d.Token() + if err != nil { + return err + } + + var se xml.StartElement + + switch el := tok.(type) { + case xml.StartElement: + se = el + case xml.EndElement: + if el == start.End() { + return nil + } + continue + default: + continue + } + + switch se.Name.Local { + case "t": + sv.Text, err = getCharData(d) + case "r": + err = sv.decodeRichText(d, se) + default: + continue + } + + if err != nil { + return err + } + } +} + +func (sv *sharedStringsValue) decodeRichText(d *xml.Decoder, start xml.StartElement) error { + for { + tok, err := d.Token() + if err != nil { + return err + } + + var se xml.StartElement + + switch el := tok.(type) { + case xml.StartElement: + se = el + case xml.EndElement: + if el == start.End() { + return nil + } + continue + default: + continue + } + + if se.Name.Local != "t" { + continue + } + + var s string + + if s, err = getCharData(d); err != nil { + return err + } + + sv.RichText = append(sv.RichText, s) + } +} + // String gets a string value from the raw sharedStringsValue struct. // Since the values can appear in many different places in the xml structure, we need to normalise this. // They can either be: @@ -42,7 +113,7 @@ func (sv *sharedStringsValue) Reset() { } // Sentinel error to indicate that no shared strings file can be found -var errNoSharedStrings = errors.New("No shared strings file exists") +var errNoSharedStrings = errors.New("no shared strings file exists") // getSharedStringsFile attempts to find and return the zip.File struct associated with the // shared strings section of an xlsx file. An error is returned if the sharedStrings file @@ -103,7 +174,7 @@ func getSharedStrings(files []*zip.File) ([]string, error) { } value.Reset() - if err := dec.DecodeElement(&value, &startElement); err != nil { + if err := value.unmarshalXML(dec, startElement); err != nil { return nil, err } @@ -124,7 +195,7 @@ func makeSharedStringsSlice(rootElem xml.StartElement) []string { count, err = strconv.Atoi(attr.Value) if err != nil { - return make([]string, 0) + return []string{} } } diff --git a/shared_strings_test.go b/shared_strings_test.go index 8c3ce20..8208da9 100644 --- a/shared_strings_test.go +++ b/shared_strings_test.go @@ -36,9 +36,9 @@ func TestLoadingSharedStrings(t *testing.T) { for name, filename := range sharedStringsTests { t.Run(name, func(t *testing.T) { actual, err := OpenFile(filename) + require.NoError(t, err) defer actual.Close() - require.NoError(t, err) require.Equal(t, []string{"rec_id", "culture", "sex"}, actual.sharedStrings) }) } diff --git a/sheets.go b/sheets.go index ad4587e..a3090ff 100644 --- a/sheets.go +++ b/sheets.go @@ -39,7 +39,7 @@ func getFileNameFromRelationships(rels []relationship, s sheet) (string, error) return "xl/" + rel.Target, nil } } - return "", fmt.Errorf("Unable to find file with relationship %s", s.RelationshipID) + return "", fmt.Errorf("unable to find file with relationship %s", s.RelationshipID) } // getWorksheets loads the workbook.xml file and extracts a list of worksheets, along @@ -56,7 +56,7 @@ func getWorksheets(files []*zip.File) ([]string, *map[string]*zip.File, error) { return nil, nil, err } - wb := workbook{} + var wb workbook err = xml.Unmarshal(data, &wb) if err != nil { return nil, nil, err @@ -77,7 +77,7 @@ func getWorksheets(files []*zip.File) ([]string, *map[string]*zip.File, error) { return nil, nil, err } - wsFileMap := make(map[string]*zip.File) + wsFileMap := map[string]*zip.File{} sheetNames := make([]string, len(wb.Sheets)) for i, sheet := range wb.Sheets { diff --git a/styles.go b/styles.go index b119a65..b6e0c37 100644 --- a/styles.go +++ b/styles.go @@ -47,7 +47,7 @@ func isDateFormatCode(formatCode string) bool { // getDateStylesFromStyleSheet populates a map of all date related styles, based on their // style sheet index. func getDateStylesFromStyleSheet(ss *styleSheet) *map[int]bool { - dateStyles := make(map[int]bool) + dateStyles := map[int]bool{} for i, style := range ss.CellStyles { if 14 <= style.NumberFormatID && style.NumberFormatID <= 22 { diff --git a/xml.go b/xml.go new file mode 100644 index 0000000..772eb11 --- /dev/null +++ b/xml.go @@ -0,0 +1,17 @@ +package xlsxreader + +import "encoding/xml" + +func getCharData(d *xml.Decoder) (string, error) { + rawToken, err := d.RawToken() + if err != nil { + return "", err + } + + cdata, ok := rawToken.(xml.CharData) + if !ok { + return "", xml.UnmarshalError("expected chardata to be present, but none was found") + } + + return string(cdata), nil +}