Skip to content

Commit

Permalink
Improve parsing speed by manual implementing xml.Unmarshaller for som…
Browse files Browse the repository at this point in the history
…e types. (#34)

* Improve parsing speed by manually implementing xml.Unmarshaler for some types.

Manual implementations were written for the types representing a row and a shared string.
This gives around a 25% speed improvement. The approach is taken from https://stackoverflow.com/a/61858457

* Fix up error messages, make private stuff, etc

* Move file to remove 'util' from name - too generic!

Co-authored-by: Douglas Parsons <[email protected]>
  • Loading branch information
xakep666 and dglsparsons authored Aug 6, 2021
1 parent 461e4ef commit 8e6e706
Show file tree
Hide file tree
Showing 10 changed files with 274 additions and 43 deletions.
20 changes: 9 additions & 11 deletions file.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ import (
type XlsxFile struct {
Sheets []string

zipReader *zip.Reader
sheetFiles map[string]*zip.File
sharedStrings []string
dateStyles map[int]bool
Expand All @@ -32,7 +31,7 @@ func getFileForName(files []*zip.File, name string) (*zip.File, error) {
}
}

return nil, fmt.Errorf("File not found: %s", name)
return nil, fmt.Errorf("file not found: %s", name)
}

// readFile opens and reads the entire contents of a *zip.File into memory.
Expand Down Expand Up @@ -70,15 +69,14 @@ func OpenFile(filename string) (*XlsxFileCloser, error) {
return nil, err
}

x := new(XlsxFile)

x := XlsxFile{}
if err := x.init(&zipFile.Reader); err != nil {
zipFile.Close()
return nil, err
}

return &XlsxFileCloser{
XlsxFile: *x,
XlsxFile: x,
zipReadCloser: zipFile,
}, nil
}
Expand All @@ -88,15 +86,15 @@ func OpenFile(filename string) (*XlsxFileCloser, error) {
// is returned.
// Note that the file must be Close()-d when you are finished with it.
func OpenReaderZip(rc *zip.ReadCloser) (*XlsxFileCloser, error) {
x := new(XlsxFile)
x := XlsxFile{}

if err := x.init(&rc.Reader); err != nil {
rc.Close()
return nil, err
}

return &XlsxFileCloser{
XlsxFile: *x,
XlsxFile: x,
zipReadCloser: rc,
}, nil
}
Expand All @@ -110,26 +108,26 @@ func NewReader(xlsxBytes []byte) (*XlsxFile, error) {
return nil, err
}

x := new(XlsxFile)
x := XlsxFile{}
err = x.init(r)
if err != nil {
return nil, err
}

return x, nil
return &x, nil
}

// NewReaderZip takes zip reader of Xlsx file and returns a populated XlsxFile struct for it.
// If the file cannot be found, or key parts of the files contents are missing, an error
// is returned.
func NewReaderZip(r *zip.Reader) (*XlsxFile, error) {
x := new(XlsxFile)
x := XlsxFile{}

if err := x.init(r); err != nil {
return nil, err
}

return x, nil
return &x, nil
}

func (x *XlsxFile) init(zipReader *zip.Reader) error {
Expand Down
15 changes: 7 additions & 8 deletions file_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,28 +28,26 @@ func TestGettingFileByNameFailure(t *testing.T) {

_, err := getFileForName(zipFiles, "OOPS")

require.EqualError(t, err, "File not found: OOPS")
require.EqualError(t, err, "file not found: OOPS")

}

func TestOpeningMissingFile(t *testing.T) {
f, err := OpenFile("this_doesnt_exist.zip")
defer f.Close()

_, err := OpenFile("this_doesnt_exist.zip")
require.EqualError(t, err, "open this_doesnt_exist.zip: no such file or directory")
}

func TestHandlingSpuriousWorkbookLinks(t *testing.T) {
f, err := OpenFile("./test/test-xl-relationship-prefix.xlsx")
defer f.Close()
require.NoError(t, err)
defer f.Close()
}

func TestOpeningXlsxFile(t *testing.T) {
f, err := OpenFile("./test/test-small.xlsx")
require.NoError(t, err)
defer f.Close()

require.NoError(t, err)
require.Equal(t, []string{"datarefinery_groundtruth_400000"}, f.Sheets)
}

Expand All @@ -59,9 +57,9 @@ func TestOpeningZipReadCloser(t *testing.T) {
defer zrc.Close()

f, err := OpenReaderZip(zrc)
require.NoError(t, err)
defer f.Close()

require.NoError(t, err)
require.Equal(t, []string{"datarefinery_groundtruth_400000"}, f.Sheets)
}

Expand All @@ -73,7 +71,8 @@ func TestClosingFile(t *testing.T) {
}

func TestNewReaderFromXlsxBytes(t *testing.T) {
f, _ := os.Open("./test/test-small.xlsx")
f, err := os.Open("./test/test-small.xlsx")
require.NoError(t, err)
defer f.Close()

b, _ := ioutil.ReadAll(f)
Expand Down
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
module github.com/thedatashed/xlsxreader

go 1.15

require github.com/stretchr/testify v1.3.0
164 changes: 154 additions & 10 deletions rows.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,51 @@ type rawRow struct {
RawCells []rawCell `xml:"c"`
}

// unmarshalXML fills rr from a row element whose start tag has already been
// read from d and is passed in as start.
//
// The "r" attribute is parsed as the row index; all other attributes are
// ignored. Every direct child element named "c" is decoded with
// rawCell.unmarshalXML and appended to rr.RawCells. Any other child element is
// skipped together with its whole subtree, so a "c" element nested inside it is
// never mistaken for a cell of this row.
//
// It returns nil once the end tag matching start has been consumed. Errors from
// strconv.Atoi, the decoder, or cell decoding are returned unchanged.
func (rr *rawRow) unmarshalXML(d *xml.Decoder, start xml.StartElement) error {
	for _, attr := range start.Attr {
		if attr.Name.Local != "r" {
			continue
		}

		var err error

		if rr.Index, err = strconv.Atoi(attr.Value); err != nil {
			return err
		}
	}

	rowEnd := start.End()

	for {
		tok, err := d.Token()
		if err != nil {
			return err
		}

		switch el := tok.(type) {
		case xml.EndElement:
			if el == rowEnd {
				return nil
			}
		case xml.StartElement:
			if el.Name.Local != "c" {
				// Consume the unknown element and everything beneath it;
				// only direct children of the row count as cells.
				if err := d.Skip(); err != nil {
					return err
				}

				continue
			}

			var rc rawCell
			if err := rc.unmarshalXML(d, el); err != nil {
				return err
			}

			rr.RawCells = append(rr.RawCells, rc)
		}
	}
}

// rawCell represents the raw XML element for parsing a cell.
type rawCell struct {
Reference string `xml:"r,attr"` // E.g. A1
Expand All @@ -22,6 +67,99 @@ type rawCell struct {
InlineString *string `xml:"is>t"`
}

// unmarshalXML fills rc from a cell element whose start tag has already been
// read from d and is passed in as start.
//
// Attributes: "r" is stored as Reference, "t" as Type, and "s" is parsed as an
// integer into Style. Other attributes are ignored.
//
// Child elements: the text of a "v" element (read via getCharData) becomes
// Value, and an "is" element is handed to unmarshalInlineString. Every other
// token is ignored. The method returns nil when the end tag matching start is
// reached; any decoder or conversion error is returned unchanged.
func (rc *rawCell) unmarshalXML(d *xml.Decoder, start xml.StartElement) error {
	for i := range start.Attr {
		name, value := start.Attr[i].Name.Local, start.Attr[i].Value

		switch name {
		case "r":
			rc.Reference = value
		case "t":
			rc.Type = value
		case "s":
			style, err := strconv.Atoi(value)
			rc.Style = style
			if err != nil {
				return err
			}
		}
	}

	cellEnd := start.End()

	for {
		tok, err := d.Token()
		if err != nil {
			return err
		}

		switch el := tok.(type) {
		case xml.EndElement:
			if el == cellEnd {
				return nil
			}
		case xml.StartElement:
			switch el.Name.Local {
			case "v":
				text, err := getCharData(d)
				if err != nil {
					return err
				}

				rc.Value = &text
			case "is":
				if err := rc.unmarshalInlineString(d, el); err != nil {
					return err
				}
			}
		}
	}
}

// unmarshalInlineString reads tokens from d, starting just after the start tag
// given as start, until it meets the first element named "t". The text of that
// element (read via getCharData) is stored in rc.InlineString and the method
// returns immediately, without consuming the rest of the enclosing element.
//
// If the end tag matching start is reached before any "t" element, it returns
// nil and leaves rc.InlineString untouched. Decoder and getCharData errors are
// returned unchanged.
func (rc *rawCell) unmarshalInlineString(d *xml.Decoder, start xml.StartElement) error {
	closing := start.End()

	for {
		tok, err := d.Token()
		if err != nil {
			return err
		}

		if end, isEnd := tok.(xml.EndElement); isEnd {
			if end == closing {
				return nil
			}

			continue
		}

		child, isStart := tok.(xml.StartElement)
		if !isStart || child.Name.Local != "t" {
			continue
		}

		text, err := getCharData(d)
		if err != nil {
			return err
		}

		rc.InlineString = &text

		return nil
	}
}

// Row represents a row of data read from an Xlsx file, in a consumable format
type Row struct {
Error error
Expand All @@ -37,15 +175,21 @@ type Cell struct {
Type CellType
}

// CellType defines the data type of an excel cell
type CellType string

const (
TypeString CellType = "string"
// TypeString is for text cells
TypeString CellType = "string"
// TypeNumerical is for numerical values
TypeNumerical CellType = "numerical"
TypeDateTime CellType = "datetime"
TypeBoolean CellType = "boolean"
// TypeDateTime is for date values
TypeDateTime CellType = "datetime"
// TypeBoolean is for true/false values
TypeBoolean CellType = "boolean"
)

// ColumnIndex returns the numeric index of the column this cell sits in,
// computed by passing the cell's Column value to asIndex.
func (c Cell) ColumnIndex() int {
return asIndex(c.Column)
}
Expand All @@ -57,13 +201,13 @@ func (c Cell) ColumnIndex() int {
func (x *XlsxFile) getCellValue(r rawCell) (string, error) {
if r.Type == "inlineStr" {
if r.InlineString == nil {
return "", fmt.Errorf("Cell had type of InlineString, but the InlineString attribute was missing")
return "", fmt.Errorf("cell had type of InlineString, but the InlineString attribute was missing")
}
return *r.InlineString, nil
}

if r.Value == nil {
return "", fmt.Errorf("Unable to get cell value for cell %s - no value element found", r.Reference)
return "", fmt.Errorf("unable to get cell value for cell %s - no value element found", r.Reference)
}

if r.Type == "s" {
Expand All @@ -72,7 +216,7 @@ func (x *XlsxFile) getCellValue(r rawCell) (string, error) {
return "", err
}
if len(x.sharedStrings) <= index {
return "", fmt.Errorf("Attempted to index value %d in shared strings of length %d",
return "", fmt.Errorf("attempted to index value %d in shared strings of length %d",
index, len(x.sharedStrings))
}

Expand Down Expand Up @@ -118,7 +262,7 @@ func (x *XlsxFile) readSheetRows(sheet string, ch chan<- Row) {
file, ok := x.sheetFiles[sheet]
if !ok {
ch <- Row{
Error: fmt.Errorf("Unable to open sheet %s", sheet),
Error: fmt.Errorf("unable to open sheet %s", sheet),
}
return
}
Expand Down Expand Up @@ -157,8 +301,8 @@ func (x *XlsxFile) readSheetRows(sheet string, ch chan<- Row) {
// The Row struct returned will contain any errors that occurred either in
// interrogating values, or in parsing the XML.
func (x *XlsxFile) parseRow(decoder *xml.Decoder, startElement *xml.StartElement) Row {
r := rawRow{}
err := decoder.DecodeElement(&r, startElement)
var r rawRow
err := r.unmarshalXML(decoder, *startElement)
if err != nil {
return Row{
Error: err,
Expand Down Expand Up @@ -242,7 +386,7 @@ func removeNonAlpha(r rune) rune {
// cell name to cell index. 'A' -> 0, 'Z' -> 25, 'AA' -> 26
func asIndex(s string) int {
index := 0
for _, c := range []rune(s) {
for _, c := range s {
index *= 26
index += int(c) - 'A' + 1
}
Expand Down
Loading

0 comments on commit 8e6e706

Please sign in to comment.