Browse Source

Allow partial reads

Ryan Hollis 8 years ago
parent
commit
c8fa6d05ba

+ 26 - 6
file.go

@@ -25,6 +25,8 @@ type File struct {
 	DefinedNames   []*xlsxDefinedName
 }
 
+const NoRowLimit int = -1
+
 // Create a new File
 func NewFile() *File {
 	return &File{
@@ -36,31 +38,49 @@ func NewFile() *File {
 
 // OpenFile() take the name of an XLSX file and returns a populated
 // xlsx.File struct for it.
-func OpenFile(filename string) (file *File, err error) {
-	var f *zip.ReadCloser
-	f, err = zip.OpenReader(filename)
+func OpenFile(fileName string) (file *File, err error) {
+	return OpenFileWithRowLimit(fileName, NoRowLimit)
+}
+
+// OpenFileWithRowLimit() will open the file, but will only read the specified number of rows.
+// If you save this file, it will be truncated to the number of rows specified.
+func OpenFileWithRowLimit(fileName string, rowLimit int) (file *File, err error) {
+	var z *zip.ReadCloser
+	z, err = zip.OpenReader(fileName)
 	if err != nil {
 		return nil, err
 	}
-	file, err = ReadZip(f)
+	file, err = ReadZipWithRowLimit(z, rowLimit)
 	return
 }
 
 // OpenBinary() take bytes of an XLSX file and returns a populated
 // xlsx.File struct for it.
 func OpenBinary(bs []byte) (*File, error) {
+	return OpenBinaryWithRowLimit(bs, NoRowLimit)
+}
+
+// OpenBinaryWithRowLimit() takes the bytes of an XLSX file and returns an xlsx.File
+// struct for it, reading only rowLimit rows from each sheet (NoRowLimit reads all).
+func OpenBinaryWithRowLimit(bs []byte, rowLimit int) (*File, error) {
 	r := bytes.NewReader(bs)
-	return OpenReaderAt(r, int64(r.Len()))
+	return OpenReaderAtWithRowLimit(r, int64(r.Len()), rowLimit)
 }
 
 // OpenReaderAt() take io.ReaderAt of an XLSX file and returns a populated
 // xlsx.File struct for it.
 func OpenReaderAt(r io.ReaderAt, size int64) (*File, error) {
+	return OpenReaderAtWithRowLimit(r, size, NoRowLimit)
+}
+
+// OpenReaderAtWithRowLimit() takes an io.ReaderAt of an XLSX file and returns an xlsx.File
+// struct for it, reading only rowLimit rows from each sheet (NoRowLimit reads all).
+func OpenReaderAtWithRowLimit(r io.ReaderAt, size int64, rowLimit int) (*File, error) {
 	file, err := zip.NewReader(r, size)
 	if err != nil {
 		return nil, err
 	}
-	return ReadZipReader(file)
+	return ReadZipReaderWithRowLimit(file, rowLimit)
 }
 
 // A convenient wrapper around File.ToSlice, FileToSlice will

+ 48 - 0
file_test.go

@@ -3,6 +3,7 @@ package xlsx
 import (
 	"encoding/xml"
 	"path/filepath"
+	"time"
 
 	. "gopkg.in/check.v1"
 )
@@ -22,6 +23,53 @@ func (l *FileSuite) TestOpenFile(c *C) {
 	c.Assert(xlsxFile, NotNil)
 }
 
+func (l *FileSuite) TestPartialReadsWithFewSharedStrings(c *C) {
+	rowLimit := 10 // read only the first 10 rows of the large fixture
+	start := time.Now() // timing covers only the OpenFileWithRowLimit call
+	file, err := OpenFileWithRowLimit("testdocs/large_sheet_no_shared_strings_no_dimension_tag.xlsx", rowLimit)
+	if err != nil {
+		c.Fatal(err)
+	}
+	timeSpent := time.Since(start)
+	timeLimit := 100 * time.Millisecond // NOTE(review): wall-clock budget; may fail spuriously on slow or loaded machines
+	if timeSpent > timeLimit {
+		c.Errorf("Reading %v rows from a sheet with ~31,000 rows and few shared strings took %v, must take less than %v", rowLimit, timeSpent, timeLimit)
+	}
+	if len(file.Sheets[0].Rows) != rowLimit { // indexes Sheets[0] without a length check; presumably the fixture always has a sheet
+		c.Errorf("Expected sheet to have %v rows, but found %v rows", rowLimit, len(file.Sheets[0].Rows))
+	}
+}
+
+func (l *FileSuite) TestPartialReadsWithSharedStrings(c *C) {
+	rowLimit := 10 // read only the first 10 rows of the large fixture
+	start := time.Now() // timing covers only the OpenFileWithRowLimit call
+	file, err := OpenFileWithRowLimit("testdocs/large_sheet_large_sharedstrings_dimension_tag.xlsx", rowLimit)
+	if err != nil {
+		c.Fatal(err)
+	}
+	timeSpent := time.Since(start)
+	timeLimit := time.Second // NOTE(review): wall-clock budget; may fail spuriously on slow or loaded machines
+	if timeSpent > timeLimit {
+		c.Errorf("Reading %v rows from a sheet with ~31,000 rows and a large shared strings took %v, must take less than %v", rowLimit, timeSpent, timeLimit)
+	}
+	// This is testing that the sheet was truncated, but it is also testing that the dimension tag was ignored.
+	// If the dimension tag is not correctly ignored, there will be 10 rows of the data, plus ~31k empty rows tacked on.
+	if len(file.Sheets[0].Rows) != rowLimit { // indexes Sheets[0] without a length check; presumably the fixture always has a sheet
+		c.Errorf("Expected sheet to have %v rows, but found %v rows", rowLimit, len(file.Sheets[0].Rows))
+	}
+}
+
+func (l *FileSuite) TestPartialReadsWithFewerRowsThanRequested(c *C) {
+	rowLimit := 10 // deliberately larger than the 2 rows this test expects from testfile.xlsx
+	file, err := OpenFileWithRowLimit("testdocs/testfile.xlsx", rowLimit)
+	if err != nil {
+		c.Fatal(err)
+	}
+	if len(file.Sheets[0].Rows) != 2 { // a limit above the sheet's row count must not change the row count
+		c.Errorf("Expected sheet to have %v rows, but found %v rows", 2, len(file.Sheets[0].Rows))
+	}
+}
+
 func (l *FileSuite) TestOpenFileWithoutStyleAndSharedStrings(c *C) {
 	var xlsxFile *File
 	var error error

+ 75 - 12
lib.go

@@ -2,6 +2,7 @@ package xlsx
 
 import (
 	"archive/zip"
+	"bytes"
 	"encoding/xml"
 	"errors"
 	"fmt"
@@ -11,6 +12,10 @@ import (
 	"strings"
 )
 
+const (
+	sheetEnding = `</sheetData></worksheet>`
+)
+
 // XLSXReaderError is the standard error type for otherwise undefined
 // errors in the XSLX reading process.
 type XLSXReaderError struct {
@@ -205,6 +210,7 @@ func getMaxMinFromDimensionRef(ref string) (minx, miny, maxx, maxy int, err erro
 // calculateMaxMinFromWorkSheet works out the dimensions of a spreadsheet
 // that doesn't have a DimensionRef set.  The only case currently
 // known where this is true is with XLSX exported from Google Docs.
+// This is also true for XLSX files created through the streaming APIs.
 func calculateMaxMinFromWorksheet(worksheet *xlsxWorksheet) (minx, miny, maxx, maxy int, err error) {
 	// Note, this method could be very slow for large spreadsheets.
 	var x, y int
@@ -492,7 +498,7 @@ func fillCellDataFromInlineString(rawcell xlsxC, cell *Cell) {
 // rows from a XSLXWorksheet, populates them with Cells and resolves
 // the value references from the reference table and stores them in
 // the rows and columns.
-func readRowsFromSheet(Worksheet *xlsxWorksheet, file *File, sheet *Sheet) ([]*Row, []*Col, int, int) {
+func readRowsFromSheet(Worksheet *xlsxWorksheet, file *File, sheet *Sheet, rowLimit int) ([]*Row, []*Col, int, int) {
 	var rows []*Row
 	var cols []*Col
 	var row *Row
@@ -506,7 +512,7 @@ func readRowsFromSheet(Worksheet *xlsxWorksheet, file *File, sheet *Sheet) ([]*R
 		return nil, nil, 0, 0
 	}
 	reftable = file.referenceTable
-	if len(Worksheet.Dimension.Ref) > 0 && len(strings.Split(Worksheet.Dimension.Ref, ":")) == 2 {
+	if len(Worksheet.Dimension.Ref) > 0 && len(strings.Split(Worksheet.Dimension.Ref, ":")) == 2 && rowLimit == NoRowLimit {
 		minCol, minRow, maxCol, maxRow, err = getMaxMinFromDimensionRef(Worksheet.Dimension.Ref)
 	} else {
 		minCol, minRow, maxCol, maxRow, err = calculateMaxMinFromWorksheet(Worksheet)
@@ -659,7 +665,7 @@ func readSheetViews(xSheetViews xlsxSheetViews) []SheetView {
 // into a Sheet struct.  This work can be done in parallel and so
 // readSheetsFromZipFile will spawn an instance of this function per
 // sheet and get the results back on the provided channel.
-func readSheetFromFile(sc chan *indexedSheet, index int, rsheet xlsxSheet, fi *File, sheetXMLMap map[string]string) (errRes error) {
+func readSheetFromFile(sc chan *indexedSheet, index int, rsheet xlsxSheet, fi *File, sheetXMLMap map[string]string, rowLimit int) (errRes error) {
 	result := &indexedSheet{Index: index, Sheet: nil, Error: nil}
 	defer func() {
 		if e := recover(); e != nil {
@@ -676,15 +682,15 @@ func readSheetFromFile(sc chan *indexedSheet, index int, rsheet xlsxSheet, fi *F
 		}
 	}()
 
-	worksheet, error := getWorksheetFromSheet(rsheet, fi.worksheets, sheetXMLMap)
-	if error != nil {
-		result.Error = error
+	worksheet, err := getWorksheetFromSheet(rsheet, fi.worksheets, sheetXMLMap, rowLimit)
+	if err != nil {
+		result.Error = err
 		sc <- result
-		return error
+		return err
 	}
 	sheet := new(Sheet)
 	sheet.File = fi
-	sheet.Rows, sheet.Cols, sheet.MaxCol, sheet.MaxRow = readRowsFromSheet(worksheet, fi, sheet)
+	sheet.Rows, sheet.Cols, sheet.MaxCol, sheet.MaxRow = readRowsFromSheet(worksheet, fi, sheet, rowLimit)
 	sheet.Hidden = rsheet.State == sheetStateHidden || rsheet.State == sheetStateVeryHidden
 	sheet.SheetViews = readSheetViews(worksheet.SheetViews)
 
@@ -701,7 +707,7 @@ func readSheetFromFile(sc chan *indexedSheet, index int, rsheet xlsxSheet, fi *F
 // readSheetsFromZipFile is an internal helper function that loops
 // over the Worksheets defined in the XSLXWorkbook and loads them into
 // Sheet objects stored in the Sheets slice of a xlsx.File struct.
-func readSheetsFromZipFile(f *zip.File, file *File, sheetXMLMap map[string]string) (map[string]*Sheet, []*Sheet, error) {
+func readSheetsFromZipFile(f *zip.File, file *File, sheetXMLMap map[string]string, rowLimit int) (map[string]*Sheet, []*Sheet, error) {
 	var workbook *xlsxWorkbook
 	var err error
 	var rc io.ReadCloser
@@ -740,7 +746,7 @@ func readSheetsFromZipFile(f *zip.File, file *File, sheetXMLMap map[string]strin
 		defer close(sheetChan)
 		err = nil
 		for i, rawsheet := range workbookSheets {
-			if err := readSheetFromFile(sheetChan, i, rawsheet, file, sheetXMLMap); err != nil {
+			if err := readSheetFromFile(sheetChan, i, rawsheet, file, sheetXMLMap, rowLimit); err != nil {
 				return
 			}
 		}
@@ -909,13 +915,28 @@ func readWorkbookRelationsFromZipFile(workbookRels *zip.File) (WorkBookRels, err
 // xlsx.File struct populated with its contents.  In most cases
 // ReadZip is not used directly, but is called internally by OpenFile.
 func ReadZip(f *zip.ReadCloser) (*File, error) {
+	return ReadZipWithRowLimit(f, NoRowLimit)
+}
+
+// ReadZipWithRowLimit() takes a pointer to a zip.ReadCloser and returns a
+// xlsx.File struct populated with its contents, limited to rowLimit rows per sheet.
+// It closes f before returning and is called internally by OpenFileWithRowLimit.
+func ReadZipWithRowLimit(f *zip.ReadCloser, rowLimit int) (*File, error) {
 	defer f.Close()
-	return ReadZipReader(&f.Reader)
+	return ReadZipReaderWithRowLimit(&f.Reader, rowLimit)
 }
 
 // ReadZipReader() can be used to read an XLSX in memory without
 // touching the filesystem.
 func ReadZipReader(r *zip.Reader) (*File, error) {
+	return ReadZipReaderWithRowLimit(r, NoRowLimit)
+}
+
+// ReadZipReaderWithRowLimit() can be used to read an XLSX in memory without
+// touching the filesystem.
+// rowLimit is the number of rows that should be read from the file. If rowLimit is -1, no limit is applied.
+// You can specify this with the constant NoRowLimit.
+func ReadZipReaderWithRowLimit(r *zip.Reader, rowLimit int) (*File, error) {
 	var err error
 	var file *File
 	var reftable *RefTable
@@ -986,7 +1007,7 @@ func ReadZipReader(r *zip.Reader) (*File, error) {
 
 		file.styles = style
 	}
-	sheetsByName, sheets, err = readSheetsFromZipFile(workbook, file, sheetXMLMap)
+	sheetsByName, sheets, err = readSheetsFromZipFile(workbook, file, sheetXMLMap, rowLimit)
 	if err != nil {
 		return nil, err
 	}
@@ -999,3 +1020,45 @@ func ReadZipReader(r *zip.Reader) (*File, error) {
 	file.Sheets = sheets
 	return file, nil
 }
+
+// truncateSheetXML tokenizes the sheet XML from r until rowLimit row end elements have been seen, then returns a
+// reader over the bytes consumed so far followed by sheetEnding, so that decoding it yields only the leading rows.
+// If r reaches io.EOF before the limit is hit, no ending is appended. Any other tokenizing error is returned as-is.
+// Everything after the last kept row, including whatever follows the sheetData element, is absent from the result.
+// NOTE(review): the counter is compared only after a row closes, so a rowLimit <= 0 still keeps one row - confirm.
+func truncateSheetXML(r io.Reader, rowLimit int) (io.Reader, error) {
+	var rowCount int
+	var token xml.Token
+	var readErr error
+
+	output := new(bytes.Buffer)
+	r = io.TeeReader(r, output) // every byte the decoder pulls from r is also written to output
+	decoder := xml.NewDecoder(r)
+
+	for {
+		token, readErr = decoder.Token()
+		if readErr == io.EOF {
+			break
+		} else if readErr != nil {
+			return nil, readErr
+		}
+		end, ok := token.(xml.EndElement)
+		if ok && end.Name.Local == "row" {
+			rowCount++
+			if rowCount >= rowLimit {
+				break // readErr is nil here, which is what makes the code below append sheetEnding
+			}
+		}
+	}
+
+	offset := decoder.InputOffset()
+	output.Truncate(int(offset)) // keep only the bytes up to the decoder's current input offset
+
+	if readErr != io.EOF {
+		_, err := output.Write([]byte(sheetEnding)) // re-close the sheetData and worksheet elements that were cut off
+		if err != nil {
+			return nil, err
+		}
+	}
+	return output, nil
+}

+ 12 - 12
lib_test.go

@@ -348,7 +348,7 @@ func (l *LibSuite) TestReadRowsFromSheet(c *C) {
 	file := new(File)
 	file.referenceTable = MakeSharedStringRefTable(sst)
 	sheet := new(Sheet)
-	rows, cols, maxCols, maxRows := readRowsFromSheet(worksheet, file, sheet)
+	rows, cols, maxCols, maxRows := readRowsFromSheet(worksheet, file, sheet, NoRowLimit)
 	c.Assert(maxRows, Equals, 2)
 	c.Assert(maxCols, Equals, 2)
 	row := rows[0]
@@ -426,7 +426,7 @@ func (l *LibSuite) TestReadRowsFromSheetWithMergeCells(c *C) {
 	file := new(File)
 	file.referenceTable = MakeSharedStringRefTable(sst)
 	sheet := new(Sheet)
-	rows, _, _, _ := readRowsFromSheet(worksheet, file, sheet)
+	rows, _, _, _ := readRowsFromSheet(worksheet, file, sheet, NoRowLimit)
 	row := rows[0] //
 	cell1 := row.Cells[0]
 	c.Assert(cell1.HMerge, Equals, 1)
@@ -503,7 +503,7 @@ func (l *LibSuite) TestReadRowsFromSheetBadR(c *C) {
 	sheet := new(Sheet)
 	// Discarding all return values; this test is a regression for
 	// a panic due to an "index out of range."
-	readRowsFromSheet(worksheet, file, sheet)
+	readRowsFromSheet(worksheet, file, sheet, NoRowLimit)
 }
 
 func (l *LibSuite) TestReadRowsFromSheetWithLeadingEmptyRows(c *C) {
@@ -549,7 +549,7 @@ func (l *LibSuite) TestReadRowsFromSheetWithLeadingEmptyRows(c *C) {
 	file := new(File)
 	file.referenceTable = MakeSharedStringRefTable(sst)
 	sheet := new(Sheet)
-	rows, _, maxCols, maxRows := readRowsFromSheet(worksheet, file, sheet)
+	rows, _, maxCols, maxRows := readRowsFromSheet(worksheet, file, sheet, NoRowLimit)
 	c.Assert(maxRows, Equals, 5)
 	c.Assert(maxCols, Equals, 1)
 
@@ -615,7 +615,7 @@ func (l *LibSuite) TestReadRowsFromSheetWithLeadingEmptyCols(c *C) {
 	file := new(File)
 	file.referenceTable = MakeSharedStringRefTable(sst)
 	sheet := new(Sheet)
-	rows, cols, maxCols, maxRows := readRowsFromSheet(worksheet, file, sheet)
+	rows, cols, maxCols, maxRows := readRowsFromSheet(worksheet, file, sheet, NoRowLimit)
 	c.Assert(maxRows, Equals, 2)
 	c.Assert(maxCols, Equals, 4)
 
@@ -754,7 +754,7 @@ func (l *LibSuite) TestReadRowsFromSheetWithEmptyCells(c *C) {
 	file := new(File)
 	file.referenceTable = MakeSharedStringRefTable(sst)
 	sheet := new(Sheet)
-	rows, cols, maxCols, maxRows := readRowsFromSheet(worksheet, file, sheet)
+	rows, cols, maxCols, maxRows := readRowsFromSheet(worksheet, file, sheet, NoRowLimit)
 	c.Assert(maxRows, Equals, 3)
 	c.Assert(maxCols, Equals, 3)
 
@@ -798,7 +798,7 @@ func (l *LibSuite) TestReadRowsFromSheetWithTrailingEmptyCells(c *C) {
 	file := new(File)
 	file.referenceTable = MakeSharedStringRefTable(sst)
 	sheet := new(Sheet)
-	rows, _, maxCol, maxRow := readRowsFromSheet(worksheet, file, sheet)
+	rows, _, maxCol, maxRow := readRowsFromSheet(worksheet, file, sheet, NoRowLimit)
 	c.Assert(maxCol, Equals, 4)
 	c.Assert(maxRow, Equals, 8)
 
@@ -908,7 +908,7 @@ func (l *LibSuite) TestReadRowsFromSheetWithMultipleSpans(c *C) {
 	file := new(File)
 	file.referenceTable = MakeSharedStringRefTable(sst)
 	sheet := new(Sheet)
-	rows, _, maxCols, maxRows := readRowsFromSheet(worksheet, file, sheet)
+	rows, _, maxCols, maxRows := readRowsFromSheet(worksheet, file, sheet, NoRowLimit)
 	c.Assert(maxRows, Equals, 2)
 	c.Assert(maxCols, Equals, 4)
 	row := rows[0]
@@ -983,7 +983,7 @@ func (l *LibSuite) TestReadRowsFromSheetWithMultipleTypes(c *C) {
 	file := new(File)
 	file.referenceTable = MakeSharedStringRefTable(sst)
 	sheet := new(Sheet)
-	rows, _, maxCols, maxRows := readRowsFromSheet(worksheet, file, sheet)
+	rows, _, maxCols, maxRows := readRowsFromSheet(worksheet, file, sheet, NoRowLimit)
 	c.Assert(maxRows, Equals, 1)
 	c.Assert(maxCols, Equals, 6)
 	row := rows[0]
@@ -1056,7 +1056,7 @@ func (l *LibSuite) TestReadRowsFromSheetWithHiddenColumn(c *C) {
 	file := new(File)
 	file.referenceTable = MakeSharedStringRefTable(sst)
 	sheet := new(Sheet)
-	rows, _, maxCols, maxRows := readRowsFromSheet(worksheet, file, sheet)
+	rows, _, maxCols, maxRows := readRowsFromSheet(worksheet, file, sheet, NoRowLimit)
 	c.Assert(maxRows, Equals, 1)
 	c.Assert(maxCols, Equals, 2)
 	row := rows[0]
@@ -1192,7 +1192,7 @@ func (l *LibSuite) TestSharedFormulas(c *C) {
 
 	file := new(File)
 	sheet := new(Sheet)
-	rows, _, maxCols, maxRows := readRowsFromSheet(worksheet, file, sheet)
+	rows, _, maxCols, maxRows := readRowsFromSheet(worksheet, file, sheet, NoRowLimit)
 	c.Assert(maxCols, Equals, 3)
 	c.Assert(maxRows, Equals, 2)
 
@@ -1331,7 +1331,7 @@ func (l *LibSuite) TestRowNotOverwrittenWhenFollowedByEmptyRow(c *C) {
 	file.referenceTable = MakeSharedStringRefTable(sst)
 
 	sheet := new(Sheet)
-	rows, _, _, _ := readRowsFromSheet(worksheet, file, sheet)
+	rows, _, _, _ := readRowsFromSheet(worksheet, file, sheet, NoRowLimit)
 	cells := rows[3].Cells
 
 	c.Assert(cells, HasLen, 1)

BIN
testdocs/large_sheet_large_sharedstrings_dimension_tag.xlsx


BIN
testdocs/large_sheet_no_shared_strings_no_dimension_tag.xlsx


+ 21 - 11
xmlWorkbook.go

@@ -177,27 +177,37 @@ func worksheetFileForSheet(sheet xlsxSheet, worksheets map[string]*zip.File, she
 }
 
 // getWorksheetFromSheet() is an internal helper function to open a
-// sheetN.xml file, refered to by an xlsx.xlsxSheet struct, from the XLSX
+// sheetN.xml file, referred to by an xlsx.xlsxSheet struct, from the XLSX
 // file and unmarshal it an xlsx.xlsxWorksheet struct
-func getWorksheetFromSheet(sheet xlsxSheet, worksheets map[string]*zip.File, sheetXMLMap map[string]string) (*xlsxWorksheet, error) {
-	var rc io.ReadCloser
+func getWorksheetFromSheet(sheet xlsxSheet, worksheets map[string]*zip.File, sheetXMLMap map[string]string, rowLimit int) (*xlsxWorksheet, error) {
+	var r io.Reader
 	var decoder *xml.Decoder
 	var worksheet *xlsxWorksheet
-	var error error
+	var err error
 	worksheet = new(xlsxWorksheet)
 
 	f := worksheetFileForSheet(sheet, worksheets, sheetXMLMap)
 	if f == nil {
 		return nil, fmt.Errorf("Unable to find sheet '%s'", sheet)
 	}
-	rc, error = f.Open()
-	if error != nil {
-		return nil, error
+	if rc, err := f.Open(); err != nil {
+		return nil, err
+	} else {
+		defer rc.Close()
+		r = rc
 	}
-	decoder = xml.NewDecoder(rc)
-	error = decoder.Decode(worksheet)
-	if error != nil {
-		return nil, error
+
+	if rowLimit != NoRowLimit {
+		r, err = truncateSheetXML(r, rowLimit)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	decoder = xml.NewDecoder(r)
+	err = decoder.Decode(worksheet)
+	if err != nil {
+		return nil, err
 	}
 	return worksheet, nil
 }