"""
Module to add excel files support
"""
import datetime
import logging
from io import StringIO
from typing import Any, Dict, Generator, List, Optional, Tuple, Union

import openpyxl
import pandas as pd
import xlrd
from openpyxl.utils.exceptions import InvalidFileException

LOGGER = logging.getLogger(__name__)

# Characters that force a CSV field to be quoted.
_CSV_SPECIAL_CHARS = (",", '"', "\n", "\r")


def _open_workbook(filepath: str) -> Union[openpyxl.workbook.Workbook, xlrd.book.Book]:
    """
    Open `filepath` with openpyxl (read-only mode) and fall back to xlrd when
    openpyxl rejects the file format (legacy `.xls` files).
    """
    try:
        return openpyxl.load_workbook(filepath, read_only=True)
    except InvalidFileException as e:
        LOGGER.info("Failed to read file %s with openpyxl. Trying xlrd.", filepath, exc_info=e)
        return xlrd.open_workbook(filepath)


def _close_workbook(wb: Union[openpyxl.workbook.Workbook, xlrd.book.Book]) -> None:
    """
    Release the file handle kept open by a read-only openpyxl workbook.
    xlrd workbooks are left untouched, as in the rest of this module.
    """
    if isinstance(wb, openpyxl.workbook.Workbook):
        wb.close()


def _list_sheet_names(wb: Union[openpyxl.workbook.Workbook, xlrd.book.Book]) -> List[str]:
    """Return the sheet names of the workbook, whatever library opened it."""
    if isinstance(wb, xlrd.book.Book):
        return list(wb.sheet_names())
    return list(wb.sheetnames)


def _header_values(
    wb: Union[openpyxl.workbook.Workbook, xlrd.book.Book], sh_name: str
) -> List[Any]:
    """
    Return the values found on the first row of the sheet `sh_name`
    (an empty list when the sheet has no row at all).
    """
    if isinstance(wb, xlrd.book.Book):
        sheet = wb.sheet_by_name(sh_name)
        # `row(0)` raises IndexError on an empty sheet
        return [cell.value for cell in sheet.row(0)] if sheet.nrows else []

    return [
        value
        for header in wb[sh_name].iter_rows(min_row=1, max_row=1, values_only=True)
        for value in header
    ]


def _old_xls_rows_iterator(
    wb: Union[openpyxl.workbook.Workbook, xlrd.book.Book],
    sh_name: str,
    preview_nrows: Optional[int],
    preview_offset: Optional[int],
) -> Generator[Any, Any, Any]:
    """
    Yield the rows (lists of xlrd cells) of the sheet `sh_name` of an xlrd workbook,
    restricted to the window described by the pagination inputs.

    The first yielded row is the "header slot" that the caller discards, hence the
    window holds `preview_nrows + 1` rows starting at index `preview_offset`.
    The window is clamped to the sheet size, so asking for more rows than available
    simply yields fewer rows instead of raising an IndexError.
    """
    # the sheet is resolved once, not once per row
    sheet = wb.sheet_by_name(sh_name)

    start = preview_offset if preview_offset is not None else 0
    if preview_nrows is None:
        stop = sheet.nrows
    else:
        stop = min(start + preview_nrows + 1, sheet.nrows)

    for rx in range(start, stop):
        yield sheet.row(rx)


def _new_xls_rows_iterator(
    wb: Union[openpyxl.workbook.Workbook, xlrd.book.Book],
    sh_name: str,
    preview_nrows: Optional[int],
    preview_offset: Optional[int],
) -> Generator[Any, Any, Any]:
    """
    Yield the rows (tuples of raw values) of the sheet `sh_name` of an openpyxl
    workbook, restricted to the window described by the pagination inputs.

    Rows are yielded one by one (like `_old_xls_rows_iterator` does) so callers can
    consume both iterators the same way.
    """
    # +1 are here because openpyxl uses 1-based indexing
    min_row = preview_offset + 1 if preview_offset else None
    if preview_nrows is None:
        max_row = None
    else:
        max_row = (preview_offset or 0) + preview_nrows + 1

    yield from wb[sh_name].iter_rows(
        min_row=min_row,
        max_row=max_row,
        values_only=True,
    )


def _get_rows_iterator(
    wb: Union[openpyxl.workbook.Workbook, xlrd.book.Book],
    sheet_name: str,
    preview_nrows: Optional[int],
    preview_offset: Optional[int],
) -> Generator[Any, Any, Any]:
    """
    Depending on the excel type (new format read by openpyxl or old one read by xlrd),
    return an iterator over the rows of the sheet `sheet_name`.
    """
    if isinstance(wb, xlrd.book.Book):
        return _old_xls_rows_iterator(wb, sheet_name, preview_nrows, preview_offset)

    return _new_xls_rows_iterator(wb, sheet_name, preview_nrows, preview_offset)


def _escape_csv_field(value: str) -> str:
    """
    Quote `value` when it holds a character that would otherwise break the csv
    payload (separator, quote or line break), doubling the embedded quotes.
    """
    if any(char in value for char in _CSV_SPECIAL_CHARS):
        return '"' + value.replace('"', '""') + '"'
    return value


def _cell_to_csv(cell: Any) -> str:
    """
    Convert a cell to its csv representation.

    xlrd hands out cell objects exposing `.value` whereas openpyxl (with
    `values_only=True`) hands out raw python values: both are supported, and any
    raw value type is accepted (e.g. `datetime.time`).
    """
    value = getattr(cell, "value", cell)

    # bool must be tested first: it is a subclass of int
    if isinstance(value, bool):
        # "True" and "False" are declared as booleans when the csv is read back
        return f'"{value}"'
    if isinstance(value, datetime.datetime):
        # in the context of a preview, a plain representation of the date is enough
        return value.strftime("%m/%d/%Y %H:%M:%S")

    # NOTE(review): an empty openpyxl cell (None) is written as the string "None",
    # as before - confirm whether an empty field would be preferable.
    return _escape_csv_field(str(value))


def _build_row_subset(
    row: Union[List[Any], Tuple[Any, ...]],
    sh_name: str,
    sheetnames: List[str],
    row_number: int,
    row_subset: List[str],
    add_sheet_column: Optional[bool] = None,
) -> List[str]:
    """
    Convert `row` to a csv line and append it to `row_subset` (which is returned).

    The sheet name is appended as an extra field when `add_sheet_column` is true.
    When `add_sheet_column` is None, it is appended if several sheets are read
    (`len(sheetnames) > 1`).

    Fields are escaped rather than altered, so commas, quotes and line breaks
    found in cells are preserved. `row_number` is unused and only kept so existing
    callers keep working.
    """
    if add_sheet_column is None:
        add_sheet_column = len(sheetnames) > 1

    fields = [_cell_to_csv(cell) for cell in row]
    if add_sheet_column:
        fields.append(_escape_csv_field(str(sh_name)))

    row_subset.append(",".join(fields) + "\n")

    return row_subset


def _get_row_subset_per_sheet(
    wb: Union[openpyxl.workbook.Workbook, xlrd.book.Book],
    sh_name: str,
    sheetnames: List[str],
    preview_nrows: Optional[int],
    preview_offset: Optional[int],
    row_subset: List[str],
    skiprows: Optional[int] = None,
    nrows: Optional[int] = None,
    skipfooter: int = 0,
    add_sheet_column: Optional[bool] = None,
) -> List[str]:
    """
    Append the csv lines of the sheet `sh_name` to `row_subset` and return the
    resulting list.

    - the first row of the iterator is the header slot and is always discarded
    - rows numbered `1..skiprows` are skipped
    - the loop stops once the row numbered `nrows` has been added
    - the last `skipfooter` lines added for THIS sheet are dropped (rows coming from
      previously read sheets are never touched)
    """
    # rows already collected from previous sheets
    first_new_row = len(row_subset)

    row_iterator = _get_rows_iterator(wb, sh_name, preview_nrows, preview_offset)
    for row_number, row in enumerate(row_iterator):
        if row_number == 0:
            # header slot: column names are collected separately
            continue
        if skiprows and row_number <= skiprows:
            continue
        row_subset = _build_row_subset(
            row, sh_name, sheetnames, row_number, row_subset, add_sheet_column
        )
        if nrows and row_number == nrows:
            break

    # to handle the skipfooter: never below what the previous sheets produced,
    # which also protects against a negative slice bound
    lines_to_keep = max(first_new_row, len(row_subset) - max(skipfooter, 0))

    return row_subset[:lines_to_keep]


def _read_sheets(
    wb: Union[openpyxl.workbook.Workbook, xlrd.book.Book],
    sheet_names: List[Any],
    preview_nrows: Optional[int],
    preview_offset: Optional[int],
    nrows: Optional[int] = None,
    skiprows: Optional[int] = None,
    skipfooter: int = 0,
    add_sheet_column: Optional[bool] = None,
) -> List[Any]:
    """
    Loop over the sheets `sheet_names`, read their content and return the list of
    csv lines. An openpyxl workbook is closed once done, even if reading fails.
    """
    row_subset: List[str] = []

    try:
        for sh_name in sheet_names:
            row_subset = _get_row_subset_per_sheet(
                wb,
                sh_name,
                sheet_names,
                preview_nrows,
                preview_offset,
                row_subset,
                skiprows,
                nrows,
                skipfooter,
                add_sheet_column,
            )
    finally:
        _close_workbook(wb)

    return row_subset


def read_excel(
    filepath: str,
    *,
    preview_nrows: Optional[int] = None,
    preview_offset: Optional[int] = None,
    sheet_name: Union[str, int, None] = "",
    na_values: Any = None,
    keep_default_na: bool = False,
    skiprows: Optional[int] = None,
    nrows: Optional[int] = None,
    skipfooter: int = 0,
) -> pd.DataFrame:
    """
    Uses openpyxl (with xlrd as fallback) to convert the excel sheet into a csv string.
    This csv is then read by pandas to make a DataFrame.

    Using these two steps, we are able to obtain better performance than pd.read_excel alone.
    Also, these two libraries are able to read only the top of each sheet,
    so we can create previews without reading the whole file.

    `sheet_name` selects what is read:
    - `""` (default): the first sheet
    - a name: this sheet
    - an index: the sheet at this (0-based) position
    - `None`: all the sheets, with an extra `__sheet__` column holding the sheet name

    Column names come from the first row of the selected sheets only. When several
    sheets are read, rows are aligned by position, so sheets are expected to share
    the same column layout.
    """
    wb = _open_workbook(filepath)
    try:
        all_sheet_names = _list_sheet_names(wb)

        if sheet_name is None:
            sheet_names = all_sheet_names
        elif sheet_name == "":
            sheet_names = all_sheet_names[:1]
        elif isinstance(sheet_name, int):
            sheet_names = [all_sheet_names[sheet_name]]
        else:
            sheet_names = [sheet_name]

        column_names: List[Any] = []
        for sh_name in sheet_names:
            for header in _header_values(wb, sh_name):
                if header not in column_names:
                    column_names.append(header)

        # the sheet column is declared and filled under the very same condition
        with_sheet_column = sheet_name is None
        if with_sheet_column and "__sheet__" not in column_names:
            column_names.append("__sheet__")

        row_subset = _read_sheets(
            wb,
            sheet_names,
            preview_nrows,
            preview_offset,
            nrows,
            skiprows,
            skipfooter,
            with_sheet_column,
        )
    finally:
        # covers failures happening before `_read_sheets` (closing twice is harmless)
        _close_workbook(wb)

    # every line already ends with a line break
    return pd.read_csv(
        StringIO("".join(row_subset)),
        header=None,
        names=column_names,
        nrows=nrows,
        na_values=na_values,
        keep_default_na=keep_default_na,
        true_values=["True"],
        false_values=["False"],
    )


def excel_meta(filepath: str, reader_kwargs: Dict[str, Any]) -> Dict[str, Any]:
    """
    Returns a dictionary with the meta information of the excel file:
    - `sheetnames`: names of all the sheets
    - `df_rows`: number of rows of the DataFrame read with `reader_kwargs`
    - `total_rows`: number of data rows (headers excluded) of all the sheets
    """
    wb = _open_workbook(filepath)
    try:
        sheet_names = _list_sheet_names(wb)
        if isinstance(wb, xlrd.book.Book):
            row_counts = [sheet.nrows for sheet in wb.sheets()]
        else:
            # `max_row` may be None in read-only mode when the file does not
            # declare its dimensions: such a sheet is counted as empty
            row_counts = [sheet.max_row or 0 for sheet in wb.worksheets]
    finally:
        _close_workbook(wb)

    # to not count headers of sheets as rows (an empty sheet counts for 0, not -1)
    total_rows = sum(max(count - 1, 0) for count in row_counts)

    df = read_excel(filepath, **reader_kwargs)

    return {
        "sheetnames": sheet_names,
        "df_rows": df.shape[0],
        "total_rows": total_rows,
    }
00000000..79520150 Binary files /dev/null and b/tests/fixtures/fixture-single-sheet-with-types.xlsx differ diff --git a/tests/readers/test_excel.py b/tests/readers/test_excel.py index bd961623..0f66ceb8 100644 --- a/tests/readers/test_excel.py +++ b/tests/readers/test_excel.py @@ -25,8 +25,8 @@ def test_simple_xls_preview(path): "moins souvent", "jamais", ], - "part": [9, 45, 35, 10, 1], - "clients": [896] * 5, + "part": [9.0, 45.0, 35.0, 10.0, 1.0], + "clients": [896.0] * 5, "pays": ["France"] * 5, } ) @@ -44,8 +44,8 @@ def test_simple_xls_preview(path): "breakdown": ["Par territoire"] * 2, "catégorie": ["Agglo 1 2014"] * 2, "fréquence": ["Au moins 1 fois/mois", "plusieurs fois/an"], - "part": [9, 45], - "clients": [896] * 2, + "part": [9.0, 45.0], + "clients": [896.0] * 2, "pays": ["France"] * 2, } ) @@ -63,8 +63,8 @@ def test_simple_xls_preview(path): "breakdown": ["Par territoire"] * 2, "catégorie": ["Agglo 1 2014"] * 2, "fréquence": ["1 fois/an", "moins souvent"], - "part": [35, 10], - "clients": [896] * 2, + "part": [35.0, 10.0], + "clients": [896.0] * 2, "pays": ["France"] * 2, } ) @@ -118,27 +118,66 @@ def test_xls_metadata(path): "total_rows": 170, } + +def test_multiple_xls_metadata(path): + """It should be able to get metadata of an excel file with multiple sheets""" # with multiple sheets ds = DataSource( path("fixture-multi-sheet.xlsx"), reader_kwargs={"sheet_name": None, "preview_nrows": 1, "preview_offset": 1}, ) + # because our excel file has 1 entry on January sheet and 3 entries in February sheet assert ds.get_df().shape == (1, 3) assert ds.get_metadata() == { "sheetnames": ["January", "February"], "df_rows": 1, - "total_rows": 2, + "total_rows": 4, } ds = DataSource( path("fixture-multi-sheet.xlsx"), reader_kwargs={"sheet_name": None, "preview_nrows": 2, "preview_offset": 2}, ) - assert ds.get_df().shape == (0, 3) + # because our excel file has 1 entry on January sheet and 3 entries in February sheet + assert ds.get_df().shape == (1, 3) + assert 
ds.get_metadata() == { + "sheetnames": ["January", "February"], + "df_rows": 1, + "total_rows": 4, + } + + ds = DataSource( + path("fixture-multi-sheet.xlsx"), + reader_kwargs={"sheet_name": None, "preview_nrows": 2}, + ) + # because our excel file has 1 entry on January sheet and 3 entries in February sheet + # the result is 3 lines here because we're previewing 2 rows from January's sheet (which is 1 as result) and + # 2 rows from February's sheet (which is 2 as result) + # 1 + 2 => 3 lines/rows + assert ds.get_df().shape == (3, 3) + assert ds.get_metadata() == { + "sheetnames": ["January", "February"], + "df_rows": 3, + "total_rows": 4, + } + + ds = DataSource( + path("fixture-multi-sheet.xlsx"), + reader_kwargs={"sheet_name": None, "preview_offset": 2}, + ) + # because our excel file has 1 entry on January sheet and 3 entries in February sheet + # the result is 0 lines/rows here because we're previewing an offset of 2 on available + # rows from January's sheet (1 row) (as result we have 0 from this sheet) and an offset of 2 + # on February's sheet rows (3rows) (as result we have 1 from this sheet) + # 0 + 1 => 1 lines/rows (the line from February sheet) + assert ds.get_df().shape == (1, 3) + assert ds.get_df().equals( + pd.DataFrame({"Month": [4], "Year": [2022], "__sheet__": ["February"]}) + ) assert ds.get_metadata() == { "sheetnames": ["January", "February"], - "df_rows": 0, - "total_rows": 2, + "df_rows": 1, + "total_rows": 4, } @@ -148,12 +187,13 @@ def test_multisheet_xlsx(path): path("fixture-multi-sheet.xlsx"), reader_kwargs={"sheet_name": None}, ) + # because our excel file has 1 entry on January sheet and 3 entries in February sheet assert ds.get_df().equals( pd.DataFrame( { - "Month": [1, 2], - "Year": [2019, 2019], - "__sheet__": ["January", "February"], + "Month": [1, 2, 3, 4], + "Year": [2019, 2019, 2021, 2022], + "__sheet__": ["January", "February", "February", "February"], } ) ) @@ -175,11 +215,29 @@ def test_multisheet_xlsx(path): 
path("fixture-multi-sheet.xlsx"), reader_kwargs={"sheet_name": "February"}, ) + # because our excel file has 1 entry on January sheet and 3 entries in February sheet assert ds.get_df().equals( pd.DataFrame( { - "Month": [2], - "Year": [2019], + "Month": [2, 3, 4], + "Year": [2019, 2021, 2022], + } + ) + ) + + +def test_with_specials_types_xlsx(path): + """It should be able to read sheet and format types""" + ds = DataSource( + path("fixture-single-sheet-with-types.xlsx"), + ) + assert ds.get_df().equals( + pd.DataFrame( + { + None: [0, 1, 2], + "bools": [True, False, True], + "dates": ["03/02/2022 05:43:04", "03/02/2022 05:43:04", "03/02/2022 05:43:04"], + "floats": [12.35, 42.69, 1234567.0], } ) ) diff --git a/tests/test_datasource.py b/tests/test_datasource.py index 04b1a44e..1867e1cd 100644 --- a/tests/test_datasource.py +++ b/tests/test_datasource.py @@ -152,22 +152,69 @@ def test_basic_excel(path): assert ds.get_metadata() == { "df_rows": 1, "sheetnames": ["January", "February"], - "total_rows": 1, + "total_rows": 4, # we have for rows as total here because january sheet has 1 row and February sheet has 3 (1 + 3) } # On match datasources, no metadata is returned: assert DataSource(path("fixture-multi-sh*t.xlsx"), match=MatchEnum.GLOB).get_metadata() == {} + # test with skiprows + ds = DataSource(path("fixture-single-sheet.xlsx"), reader_kwargs={"skiprows": 2}) + assert ds.get_df().shape == (0, 2) + + # test with nrows and skiprows + ds = DataSource(path("fixture-single-sheet.xlsx"), reader_kwargs={"nrows": 1, "skiprows": 2}) + assert ds.get_df().shape == (0, 2) + + # test with skiprows and limit offset + ds = DataSource( + path("fixture-single-sheet.xlsx"), + reader_kwargs={"skiprows": 2, "preview_nrows": 1, "preview_offset": 0}, + ) + assert ds.get_df().shape == (0, 2) + + # test with nrows and limit offset + ds = DataSource( + path("fixture-single-sheet.xlsx"), + reader_kwargs={"nrows": 1, "preview_nrows": 1, "preview_offset": 0}, + ) + assert 
ds.get_df().shape == (1, 2) + + # test with the new file format type + ds = DataSource( + path("fixture_new_format.xls"), reader_kwargs={"preview_nrows": 1, "preview_offset": 2} + ) + assert ds.get_df().shape == (1, 8) + + # test with nrows + ds = DataSource(path("fixture_new_format.xls"), reader_kwargs={"nrows": 2}) + assert ds.get_df().shape == (2, 8) + + # test with skiprows + ds = DataSource(path("fixture_new_format.xls"), reader_kwargs={"skiprows": 2}) + assert ds.get_df().shape == (7, 8) + + # test with nrows and skiprows + ds = DataSource(path("fixture_new_format.xls"), reader_kwargs={"nrows": 1, "skiprows": 2}) + assert ds.get_df().shape == (1, 8) + def test_multi_sheets_excel(path): """It should add a __sheet__ column when retrieving multiple sheet""" ds = DataSource(path("fixture-multi-sheet.xlsx"), reader_kwargs={"sheet_name": None}) - df = pd.DataFrame({"Month": [1, 2], "Year": [2019, 2019], "__sheet__": ["January", "February"]}) + # because our excel file has 1 entry on January sheet and 3 entries in February sheet + df = pd.DataFrame( + { + "Month": [1, 2, 3, 4], + "Year": [2019, 2019, 2021, 2022], + "__sheet__": ["January", "February", "February", "February"], + } + ) assert ds.get_df().equals(df) assert ds.get_metadata() == { - "df_rows": 2, + "df_rows": 4, "sheetnames": ["January", "February"], - "total_rows": 2, + "total_rows": 4, }