feat: preview/pagination optimization on huge excel file [TCTC-1973] (#86)

* feat: optimize the excel reader

* test: added more data in the excel fixture

* tests: split tests for excel metadata

* fix: fix other tests + clean/shrink loops

* clear: removed some compact code for a clearer view

* clean: clean way to handle skipfooter at the bottom

* fix: remove another redundant loop

* test: adapt tests from excel fixture

* fix: fix errors and adapt the code for read_excel expected output

* feat: added more tests for coverage

* fix: clean/refactor + fix some things according to the tests

* test: add more tests and explanations for the coverage

* fix a test for the total_rows on excel

* feat: completely removed pandas read_excel/ExcelFile support

* type: handle datetime

* feat: clean columns extraction

* clean: clean stuff from review

* feat: explicit iteration

* doc: added comment about removing commas + clean

* feat: inferring type for cell

* test: add coverage for special types

* test(excel): add floats to types fixtures cases

Co-authored-by: David Nowinsky <[email protected]>
Sanix-Darker and davinov authored Mar 2, 2022
1 parent 9e28504 commit b1ff48b
Showing 6 changed files with 422 additions and 66 deletions.
343 changes: 297 additions & 46 deletions peakina/readers/excel.py
@@ -1,70 +1,321 @@
"""
Module to add support for excel files
"""
import datetime
import logging
from io import StringIO
from typing import Any, Dict, Generator, List, Optional, Tuple, Union

import openpyxl
import pandas as pd
import xlrd
from openpyxl.utils.exceptions import InvalidFileException

LOGGER = logging.getLogger(__name__)

def _old_xls_rows_iterator(
wb: Union[openpyxl.workbook.Workbook, xlrd.book.Book],
sh_name: str,
preview_nrows: Optional[int],
preview_offset: Optional[int],
) -> Generator[Any, Any, Any]:
"""
Depending on the pagination inputs (preview_nrows, preview_offset), we want
an iterator object to loop over the target rows; here we return a generator
yielding the workbook's rows one by one
"""

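# select the 0-based window of rows to read; the +1 keeps one extra row
# because the first yielded row is later skipped as the header
# (see __loop_and_fill_row_subsets)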
if preview_nrows is None and preview_offset is not None:
to_iter = range(preview_offset, wb.sheet_by_name(sh_name).nrows)
elif preview_nrows is not None and preview_offset is not None:
to_iter = range(preview_offset, preview_offset + preview_nrows + 1)
elif preview_nrows is not None and preview_offset is None:
to_iter = range(preview_nrows + 1)
else:
to_iter = range(wb.sheet_by_name(sh_name).nrows)

for rx in to_iter:
yield wb.sheet_by_name(sh_name).row(rx)


def _new_xls_rows_iterator(
wb: Union[openpyxl.workbook.Workbook, xlrd.book.Book],
sh_name: str,
preview_nrows: Optional[int],
preview_offset: Optional[int],
) -> Generator[Any, Any, Any]:
"""
Depending on the pagination inputs (preview_nrows, preview_offset), we want
an iterator object to loop over the target rows; here we return an iterator
built from openpyxl's iter_rows method
"""

# the +1s are here because openpyxl uses 1-based indexing
if preview_nrows is not None and preview_offset is not None:
max_row = preview_offset + 1 + preview_nrows
elif preview_nrows is not None and preview_offset is None:
max_row = preview_nrows + 1
else:
max_row = None

if preview_offset:
min_row = preview_offset + 1
else:
min_row = None

# Note: we yield the row generator itself as a single item; the caller iterates over it
yield wb[sh_name].iter_rows(
min_row=min_row,
max_row=max_row,
values_only=True,
)


def _get_rows_iterator(
wb: Union[openpyxl.workbook.Workbook, xlrd.book.Book],
sheet_name: str,
preview_nrows: Optional[int],
preview_offset: Optional[int],
) -> Generator[Any, Any, Any]:
"""
Depending on the excel format (the new xlsx one or the legacy one),
this method returns an iterator over the rows of the given sheet
"""

if isinstance(wb, xlrd.book.Book):
return _old_xls_rows_iterator(wb, sheet_name, preview_nrows, preview_offset)

return _new_xls_rows_iterator(wb, sheet_name, preview_nrows, preview_offset)


def _build_row_subset(
row: Union[List[Any], Tuple[Any]],
sh_name: str,
sheetnames: List[str],
row_number: int,
row_subset: List[str],
) -> List[str]:
"""
This method builds each csv row and appends an extra column holding the
sheet name when the workbook contains multiple sheets
"""

def _infer_type(cell_value: Any) -> Any:
value = str(cell_value)
if type(cell_value) in [int, float, str]:
# we're removing "," from cells because comma is the separator of our csv payload
# and keeping commas inside cells could generate fake mismatch errors on columns...
value = str(cell_value).replace(",", " ")
elif type(cell_value) == bool:
# quote booleans so that "True"/"False" can be mapped back via true_values/false_values in read_excel
value = f'"{cell_value}"'
elif type(cell_value) in [datetime.datetime]:
# in the context of a preview only, it's okay to
# just keep a string representation of the date
value = cell_value.strftime("%m/%d/%Y %H:%M:%S")

return value

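# openpyxl (values_only=True) yields plain values while xlrd yields Cell
# objects, so `.value` is taken only when the item is neither a plain scalar nor None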
cells = [
_infer_type(cell.value)
if type(cell) not in [str, int, float, bool, datetime.datetime] and cell is not None
else _infer_type(cell)
for cell in row
]

if len(sheetnames) > 1:
row_subset.append(f'{",".join([*cells, sh_name])}\n')
else:
row_subset.append(f'{",".join(cells)}\n')

return row_subset


def _get_row_subset_per_sheet(
wb: Union[openpyxl.workbook.Workbook, xlrd.book.Book],
sh_name: str,
sheetnames: List[str],
preview_nrows: Optional[int],
preview_offset: Optional[int],
row_subset: List[str],
skiprows: Optional[int] = None,
nrows: Optional[int] = None,
skipfooter: int = 0,
) -> List[str]:
"""
This method gets an iterator from the workbook and
constructs a list of rows inside row_subset
"""
# we get the row iterator from here
row_iterator = _get_rows_iterator(wb, sh_name, preview_nrows, preview_offset)

def __loop_and_fill_row_subsets(row_subset: List[str], loop_on: Any) -> List[str]:
headers_skipped = False
for row_number, row in loop_on:
# skip the first yielded row of each sheet: headers are collected separately in read_excel
if not headers_skipped:
headers_skipped = True
continue
if skiprows:
if row_number <= skiprows:
continue
row_subset = _build_row_subset(row, sh_name, sheetnames, row_number, row_subset)
if nrows:
if row_number == nrows:
break

return row_subset

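# the openpyxl iterator yields a single row-generator (see _new_xls_rows_iterator)
# whereas the xlrd one yields rows directly, hence the two branches below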
if isinstance(wb, openpyxl.workbook.Workbook):
for row_iter in row_iterator:
row_subset = __loop_and_fill_row_subsets(row_subset, enumerate(row_iter))
else:
row_subset = __loop_and_fill_row_subsets(row_subset, enumerate(row_iterator))

# handle skipfooter: drop the last `skipfooter` lines
lines_to_keep = len(row_subset) - skipfooter

return row_subset[:lines_to_keep]


def _read_sheets(
wb: Union[openpyxl.workbook.Workbook, xlrd.book.Book],
sheet_names: List[Any],
preview_nrows: Optional[int],
preview_offset: Optional[int],
nrows: Optional[int] = None,
skiprows: Optional[int] = None,
skipfooter: int = 0,
) -> List[Any]:
"""
This method loops over the sheets, reads their content and returns a list
of rows matching the given pagination inputs
"""

row_subset: List[str] = []

for sh_name in sheet_names:
row_subset = _get_row_subset_per_sheet(
wb,
sh_name,
sheet_names,
preview_nrows,
preview_offset,
row_subset,
skiprows,
nrows,
skipfooter,
)

if isinstance(wb, openpyxl.workbook.Workbook):
wb.close()

return row_subset


def read_excel(
filepath: str,
*,
preview_nrows: Optional[int] = None,
preview_offset: Optional[int] = None,
sheet_name: str = "",
na_values: Any = None,
keep_default_na: bool = False,
skiprows: Optional[int] = None,
nrows: Optional[int] = None,
skipfooter: int = 0,
) -> pd.DataFrame:
"""
Uses openpyxl (with xlrd as fallback) to convert the excel sheet into a csv string.
This csv is then read by pandas to make a DataFrame.
Using these two steps, we are able to obtain better performance than pd.read_excel alone.
Also, these two libraries are able to read only the top of each sheet,
so we can create previews without reading the whole file.
"""

column_names = []

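# collect the header row of every sheet up front; the union of the headers
# becomes the csv column names (header rows themselves are skipped later on)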
try:
wb = openpyxl.load_workbook(filepath, read_only=True)
all_sheet_names = wb.sheetnames

# we get column names with the iterator
for sh_name in all_sheet_names:
for column_list in [list(c) for c in wb[sh_name].iter_rows(min_row=1, max_row=1)]:
for co in column_list:
if co.value not in column_names:
column_names.append(co.value)

except InvalidFileException as e:
LOGGER.info(f"Failed to read file {filepath} with openpyxl. Trying xlrd.", exc_info=e)
wb = xlrd.open_workbook(filepath)
all_sheet_names = wb.sheet_names()

for sh_name in all_sheet_names:
column_names += [
c.value for c in wb.sheet_by_name(sh_name).row(0) if c.value not in column_names
]

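# with a multi-sheet workbook, sheet_name="" (the default) previews only the
# first sheet, while sheet_name=None reads all sheets and tags rows with a "__sheet__" column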
sheet_names = [sheet_name] if sheet_name else all_sheet_names
if len(all_sheet_names) > 1:
sheet_names = [all_sheet_names[0]] if sheet_name == "" else sheet_names

row_subset = _read_sheets(
wb, sheet_names, preview_nrows, preview_offset, nrows, skiprows, skipfooter
)
if sheet_name is None:
if "__sheet__" not in column_names: # type: ignore
column_names.append("__sheet__")

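# header=None because header rows were skipped while building the csv;
# the column names collected above are passed explicitly instead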
columns_kwargs = {
"header": None,
"names": column_names,
}

return pd.read_csv(
StringIO("\n".join(row_subset)),
nrows=nrows,
na_values=na_values,
keep_default_na=keep_default_na,
true_values=["True"],
false_values=["False"],
**columns_kwargs,
)


def excel_meta(filepath: str, reader_kwargs: Dict[str, Any]) -> Dict[str, Any]:
"""
Returns a dictionary with the meta information of the excel file.
"""
total_rows = 0
try:
wb = openpyxl.load_workbook(filepath, read_only=True)
for sheet in wb.worksheets:
total_rows += sheet.max_row
sheet_names = wb.sheetnames
except InvalidFileException as e:
LOGGER.info(f"Failed to read file {filepath} with openpyxl. Trying xlrd.", exc_info=e)
wb = xlrd.open_workbook(filepath)
sheet_names = wb.sheet_names()
for sheet in sheet_names:
total_rows += wb.sheet_by_name(sheet).nrows

# don't count the header row of each sheet as a data row:
total_rows -= len(sheet_names)

df = read_excel(filepath, **reader_kwargs)

return {
"sheetnames": sheet_names,
"df_rows": df.shape[0],
"total_rows": total_rows,
}
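
For reference, a minimal usage sketch of the new reader (not part of the commit; the file path and numbers are hypothetical):

from peakina.readers.excel import excel_meta, read_excel

# preview 50 rows starting at row 100 of the first sheet: openpyxl's
# read-only mode streams only that window instead of loading the whole file
df = read_excel("big-file.xlsx", preview_offset=100, preview_nrows=50)

# meta information: total_rows is derived from the sheets' dimensions,
# so the whole workbook does not have to be parsed
meta = excel_meta("big-file.xlsx", reader_kwargs={"preview_nrows": 50})
print(meta["sheetnames"], meta["total_rows"], meta["df_rows"])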
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "peakina"
version = "0.7.2"
version = "0.7.3"
description = "pandas readers on steroids (remote files, glob patterns, cache, etc.)"
authors = ["Toucan Toco <[email protected]>"]
readme = "README.md"
Binary file modified tests/fixtures/fixture-multi-sheet.xlsx
