-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: preview/pagination optimization on huge excel file [TCTC-1973] (#…
…86) * feat: optimize the excel reader * test: added more data in the excel fixture * tests: split tests for excel metadata * fix: fix other tests + clean/shrink loops * clear: removed some compact code for clear view * clean: clean way to handle skipfooter at the bottom * fix: remove another overdoing loop * test: adapt tests from excel fixture * fix: fix errors and adapt the code for read_excel expected output * feat: added more tests for coverage * fix: clean/refacto + fix some accordingly to tests * test: add more tests and explanations for the coverage * fix a test for the total_rows on excel * feat: completely removed pandas read_excel/Excelfile support * type: handle datetime * feat: clean columns extraction * clean: clean stuffs from review * feat: explicit iteration * doc: added comment for removing comma + clean * feat: inferring type for cell * test: add coverage for specials types * test(excel): add floats to types fixtures cases Co-authored-by: David Nowinsky <[email protected]>
- Loading branch information
1 parent
9e28504
commit b1ff48b
Showing
6 changed files
with
422 additions
and
66 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,70 +1,321 @@ | ||
""" | ||
Module to add excel files support | ||
""" | ||
import datetime | ||
import logging | ||
from functools import wraps | ||
from typing import TYPE_CHECKING, Any, Dict, Optional, Union | ||
from io import StringIO | ||
from typing import Any, Dict, Generator, List, Optional, Tuple, Union | ||
|
||
import openpyxl | ||
import pandas as pd | ||
import xlrd | ||
from openpyxl.utils.exceptions import InvalidFileException | ||
|
||
if TYPE_CHECKING: | ||
from os import PathLike | ||
LOGGER = logging.getLogger(__name__) | ||
|
||
FilePathOrBuffer = Union[str, bytes, PathLike[str], PathLike[bytes]] | ||
|
||
LOGGER = logging.getLogger(__name__) | ||
def _old_xls_rows_iterator(
    wb: Union[openpyxl.workbook.Workbook, xlrd.book.Book],
    sh_name: str,
    preview_nrows: Optional[int],
    preview_offset: Optional[int],
) -> Generator[Any, Any, Any]:
    """
    Yield rows of one sheet of an old-format (.xls) workbook, honouring pagination.

    Depending on the pagination inputs (preview_nrows, preview_offset), we
    compute the range of row indices to visit, then yield each xlrd row in turn.
    """
    # Resolve the sheet once: the previous version called
    # wb.sheet_by_name(sh_name) on every single row, which is wasted work
    # on large sheets.
    sheet = wb.sheet_by_name(sh_name)

    if preview_nrows is None and preview_offset is not None:
        # offset only: read from the offset down to the end of the sheet
        to_iter = range(preview_offset, sheet.nrows)
    elif preview_nrows is not None and preview_offset is not None:
        # offset + nrows: the +1 accounts for the header row
        to_iter = range(preview_offset, preview_offset + preview_nrows + 1)
    elif preview_nrows is not None and preview_offset is None:
        # nrows only: the +1 accounts for the header row
        to_iter = range(preview_nrows + 1)
    else:
        # no pagination: read the whole sheet
        to_iter = range(sheet.nrows)

    for rx in to_iter:
        yield sheet.row(rx)
|
||
|
||
def _new_xls_rows_iterator(
    wb: Union[openpyxl.workbook.Workbook, xlrd.book.Book],
    sh_name: str,
    preview_nrows: Optional[int],
    preview_offset: Optional[int],
) -> Generator[Any, Any, Any]:
    """
    Build the row iterator for one sheet of a new-format (.xlsx) workbook.

    Translates the pagination inputs (preview_nrows, preview_offset) into the
    1-based min_row/max_row bounds expected by openpyxl's iter_rows, then
    yields a single iter_rows generator over that window.
    """
    # openpyxl is 1-based, hence the +1 adjustments below
    max_row: Optional[int]
    if preview_nrows is None:
        max_row = None
    elif preview_offset is None:
        max_row = preview_nrows + 1
    else:
        max_row = preview_offset + 1 + preview_nrows

    # a falsy offset (None or 0) means "start from the top"
    min_row = preview_offset + 1 if preview_offset else None

    # NOTE: we yield the generator itself, not its rows; the caller
    # (_get_row_subset_per_sheet) loops over it explicitly.
    yield wb[sh_name].iter_rows(
        min_row=min_row,
        max_row=max_row,
        values_only=True,
    )
|
||
|
||
def _get_rows_iterator(
    wb: Union[openpyxl.workbook.Workbook, xlrd.book.Book],
    sheet_name: str,
    preview_nrows: Optional[int],
    preview_offset: Optional[int],
) -> Generator[Any, Any, Any]:
    """
    Dispatch to the row iterator matching the workbook's format.

    Old .xls workbooks (xlrd) and new .xlsx workbooks (openpyxl) expose
    different APIs, so each format has its own iterator implementation.
    """
    make_iterator = (
        _old_xls_rows_iterator if isinstance(wb, xlrd.book.Book) else _new_xls_rows_iterator
    )
    return make_iterator(wb, sheet_name, preview_nrows, preview_offset)
|
||
|
||
def _build_row_subset( | ||
row: Union[List[Any], Tuple[Any]], | ||
sh_name: str, | ||
sheetnames: List[str], | ||
row_number: int, | ||
row_subset: List[str], | ||
) -> List[str]: | ||
""" | ||
This method will build each row and add an extra row for the sheet_name | ||
If we're in an excel with multiple sheets | ||
""" | ||
|
||
def _infer_type(cell_value: Any) -> Any: | ||
value = str(cell_value) | ||
if type(cell_value) in [int, float, str]: | ||
# we're removing "," from cells because we're going to be using comma as seperator for our csv payload | ||
# and if we keep some cells with comma, it could generate fake mismatch errors on columns... | ||
value = str(cell_value).replace(",", " ") | ||
elif type(cell_value) == bool: | ||
# we're assuming "True" and "False" will be considered as booleans | ||
value = f'"{cell_value}"' | ||
elif type(cell_value) in [datetime.datetime]: | ||
# in teh context of only preview, i think it's okay to | ||
# just have a representation of the date | ||
value = cell_value.strftime("%m/%d/%Y %H:%M:%S") | ||
|
||
return value | ||
|
||
cells = [ | ||
_infer_type(cell.value) | ||
if type(cell) not in [str, int, float, bool, datetime.datetime] and cell is not None | ||
else _infer_type(cell) | ||
for cell in row | ||
] | ||
|
||
if len(sheetnames) > 1: | ||
row_subset.append(f'{",".join([*cells, sh_name])}\n') | ||
else: | ||
row_subset.append(f'{",".join(cells)}\n') | ||
|
||
return row_subset | ||
|
||
|
||
def _get_row_subset_per_sheet(
    wb: Union[openpyxl.workbook.Workbook, xlrd.book.Book],
    sh_name: str,
    sheetnames: List[str],
    preview_nrows: Optional[int],
    preview_offset: Optional[int],
    row_subset: List[str],
    skiprows: Optional[int] = None,
    nrows: Optional[int] = None,
    skipfooter: int = 0,
) -> List[str]:
    """
    Read the rows of one sheet via the pagination iterator and append their
    CSV-formatted lines to `row_subset`, which is returned.
    """
    # the format-appropriate iterator over the sheet's rows
    rows_iterator = _get_rows_iterator(wb, sh_name, preview_nrows, preview_offset)

    def _consume(subset: List[str], indexed_rows: Any) -> List[str]:
        """Append each non-header row of `indexed_rows` to `subset`."""
        header_skipped = False
        for index, current_row in indexed_rows:
            # the first yielded row is the header: drop it so sheets after
            # the first don't repeat their column names
            if not header_skipped:
                header_skipped = True
                continue
            # skip rows up to and including index `skiprows` (header is index 0)
            if skiprows and index <= skiprows:
                continue
            subset = _build_row_subset(current_row, sh_name, sheetnames, index, subset)
            # stop early once row index `nrows` has been consumed
            if nrows and index == nrows:
                break
        return subset

    if isinstance(wb, openpyxl.workbook.Workbook):
        # the openpyxl iterator yields a single iter_rows generator
        for inner_rows in rows_iterator:
            row_subset = _consume(row_subset, enumerate(inner_rows))
    else:
        row_subset = _consume(row_subset, enumerate(rows_iterator))

    # drop the last `skipfooter` collected lines
    return row_subset[: len(row_subset) - skipfooter]
|
||
|
||
def _read_sheets(
    wb: Union[openpyxl.workbook.Workbook, xlrd.book.Book],
    sheet_names: List[Any],
    preview_nrows: Optional[int],
    preview_offset: Optional[int],
    nrows: Optional[int] = None,
    skiprows: Optional[int] = None,
    skipfooter: int = 0,
) -> List[Any]:
    """
    Collect the CSV-formatted rows of every requested sheet into one list.

    Sheets are read one after another, accumulating into the same list so the
    resulting payload can be parsed as a single csv document.
    """
    accumulated: List[str] = []

    for name in sheet_names:
        accumulated = _get_row_subset_per_sheet(
            wb,
            name,
            sheet_names,
            preview_nrows,
            preview_offset,
            accumulated,
            skiprows,
            nrows,
            skipfooter,
        )

    # openpyxl keeps the file handle open in read-only mode: release it
    if isinstance(wb, openpyxl.workbook.Workbook):
        wb.close()

    return accumulated
|
||
|
||
def read_excel(
    filepath: str,
    *,
    preview_nrows: Optional[int] = None,
    preview_offset: Optional[int] = None,
    sheet_name: Optional[str] = "",  # Optional: None explicitly means "all sheets"
    na_values: Any = None,
    keep_default_na: bool = False,
    skiprows: Optional[int] = None,
    nrows: Optional[int] = None,
    skipfooter: int = 0,
) -> pd.DataFrame:
    """
    Uses openpyxl (with xlrd as fallback) to convert the excel sheet into a csv string.
    This csv is then read by pandas to make a DataFrame.
    Using these two steps, we are able to obtain better performance than pd.read_excel alone.
    Also, these two libraries are able to read only the top of each sheet,
    so we can create previews without reading the whole file.
    """
    column_names: List[Any] = []

    try:
        wb = openpyxl.load_workbook(filepath, read_only=True)
        all_sheet_names = wb.sheetnames

        # column names come from the first row of every sheet (deduplicated)
        for sh_name in all_sheet_names:
            for column_list in [list(c) for c in wb[sh_name].iter_rows(min_row=1, max_row=1)]:
                for co in column_list:
                    if co.value not in column_names:
                        column_names.append(co.value)

    except InvalidFileException as e:
        # old .xls files are not supported by openpyxl: fall back to xlrd
        LOGGER.info(f"Failed to read file {filepath} with openpyxl. Trying xlrd.", exc_info=e)
        wb = xlrd.open_workbook(filepath)
        all_sheet_names = wb.sheet_names()

        for sh_name in all_sheet_names:
            column_names += [
                c.value for c in wb.sheet_by_name(sh_name).row(0) if c.value not in column_names
            ]

    # sheet selection: an explicit name selects that sheet; the default ""
    # on a multi-sheet file selects only the first sheet; None selects all
    sheet_names = [sheet_name] if sheet_name else all_sheet_names
    if len(all_sheet_names) > 1:
        sheet_names = [all_sheet_names[0]] if sheet_name == "" else sheet_names

    row_subset = _read_sheets(
        wb, sheet_names, preview_nrows, preview_offset, nrows, skiprows, skipfooter
    )

    if sheet_name is None:
        # reading all sheets adds the extra __sheet__ column
        if "__sheet__" not in column_names:
            column_names.append("__sheet__")

    columns_kwargs = {
        "header": None,
        "names": column_names,
    }

    return pd.read_csv(
        StringIO("\n".join(row_subset)),
        nrows=nrows,
        na_values=na_values,
        keep_default_na=keep_default_na,
        true_values=["True"],
        false_values=["False"],
        **columns_kwargs,
    )
|
||
def excel_meta(filepath: str, reader_kwargs: Dict[str, Any]) -> Dict[str, Any]:
    """
    Returns a dictionary with the meta information of the excel file:
    its sheet names, the number of rows of the returned DataFrame, and the
    total number of data rows across all sheets (headers excluded).
    """
    total_rows = 0
    try:
        wb = openpyxl.load_workbook(filepath, read_only=True)
        for sheet in wb.worksheets:
            total_rows += sheet.max_row
        sheet_names = wb.sheetnames
    except InvalidFileException as e:
        # old .xls files are not supported by openpyxl: fall back to xlrd
        LOGGER.info(f"Failed to read file {filepath} with openpyxl. Trying xlrd.", exc_info=e)
        wb = xlrd.open_workbook(filepath)
        sheet_names = wb.sheet_names()
        for sheet in sheet_names:
            total_rows += wb.sheet_by_name(sheet).nrows

    # to not count headers of sheets as rows:
    total_rows -= len(sheet_names)

    df = read_excel(filepath, **reader_kwargs)

    return {
        "sheetnames": sheet_names,
        "df_rows": df.shape[0],
        "total_rows": total_rows,
    }
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
[tool.poetry] | ||
name = "peakina" | ||
version = "0.7.2" | ||
version = "0.7.3" | ||
description = "pandas readers on steroids (remote files, glob patterns, cache, etc.)" | ||
authors = ["Toucan Toco <[email protected]>"] | ||
readme = "README.md" | ||
|
Binary file not shown.
Binary file not shown.
Oops, something went wrong.