From b1ff48bedac492af83e0245fbe8e0757a9253cdc Mon Sep 17 00:00:00 2001 From: darker Date: Wed, 2 Mar 2022 10:43:23 +0100 Subject: [PATCH] feat: preview/pagination optimization on huge excel file [TCTC-1973] (#86) * feat: optimize the excel reader * test: added more data in the excel fixture * tests: split tests for excel metadatas * fix: fix other tests + clean/shrink loops * clear: removed some compact code for clear view * clean: clean way to handle skipfooter at the bottom * fix: remove another overdoing loop * test: adapt tests from excel fixture * fix: fix errors and adapt the code for read_excel expected output * feat: added more tests for coverage * fix: clean/refacto + fix some accordinly to tests * test: add more tests and explanations for the coverage * fix a test for the total_rows on excel * feat: completely removed pandas read_excel/Excelfile support * type: handle datetime * feat: clean columns extraction * clean: clean stuffs from review * feat: explicit iteration * doc: added comment for removing comma + clean * feat: infering type for cell * test: add coverage for specials types * test(excel): add floats to types fixtures cases Co-authored-by: David Nowinsky --- peakina/readers/excel.py | 343 +++++++++++++++--- pyproject.toml | 2 +- tests/fixtures/fixture-multi-sheet.xlsx | Bin 6021 -> 6164 bytes .../fixture-single-sheet-with-types.xlsx | Bin 0 -> 8757 bytes tests/readers/test_excel.py | 88 ++++- tests/test_datasource.py | 55 ++- 6 files changed, 422 insertions(+), 66 deletions(-) create mode 100644 tests/fixtures/fixture-single-sheet-with-types.xlsx diff --git a/peakina/readers/excel.py b/peakina/readers/excel.py index 94123e19..a88d5b4e 100644 --- a/peakina/readers/excel.py +++ b/peakina/readers/excel.py @@ -1,70 +1,321 @@ """ Module to add excel files support """ +import datetime import logging -from functools import wraps -from typing import TYPE_CHECKING, Any, Dict, Optional, Union +from io import StringIO +from typing import Any, Dict, Generator, List, Optional, Tuple, Union +import openpyxl import pandas as pd +import xlrd +from openpyxl.utils.exceptions import InvalidFileException -if TYPE_CHECKING: - from os import PathLike +LOGGER = logging.getLogger(__name__) - FilePathOrBuffer = Union[str, bytes, PathLike[str], PathLike[bytes]] -LOGGER = logging.getLogger(__name__) +def _old_xls_rows_iterator( + wb: Union[openpyxl.workbook.Workbook, xlrd.book.Book], + sh_name: str, + preview_nrows: Optional[int], + preview_offset: Optional[int], +) -> Generator[Any, Any, Any]: + """ + Depending on paginations inputs (preview_rows, preview_offset), we want to + get an iterator object to loop on target rows, here we're returning an iterator + using yield for each iteration in the workbook + + """ + + if preview_nrows is None and preview_offset is not None: + to_iter = range(preview_offset, wb.sheet_by_name(sh_name).nrows) + elif preview_nrows is not None and preview_offset is not None: + to_iter = range(preview_offset, preview_offset + preview_nrows + 1) + elif preview_nrows is not None and preview_offset is None: + to_iter = range(preview_nrows + 1) + else: + to_iter = range(wb.sheet_by_name(sh_name).nrows) + + for rx in to_iter: + yield wb.sheet_by_name(sh_name).row(rx) + + +def _new_xls_rows_iterator( + wb: Union[openpyxl.workbook.Workbook, xlrd.book.Book], + sh_name: str, + preview_nrows: Optional[int], + preview_offset: Optional[int], +) -> Generator[Any, Any, Any]: + """ + Depending on paginations inputs (preview_rows, preview_offset), we want to + get an iterator object to loop on target rows, here we're returning an iterator + from the iter_rows built-in function from openpyxl + + """ + + # +1 are here because this is 1-based indexing + if preview_nrows is not None and preview_offset is not None: + max_row = preview_offset + 1 + preview_nrows + elif preview_nrows is not None and preview_offset is None: + max_row = preview_nrows + 1 + else: + max_row = None + + if preview_offset: + min_row = preview_offset + 1 + else: + min_row = None + + # Then we return the generator + yield wb[sh_name].iter_rows( + min_row=min_row, + max_row=max_row, + values_only=True, + ) + + +def _get_rows_iterator( + wb: Union[openpyxl.workbook.Workbook, xlrd.book.Book], + sheet_name: str, + preview_nrows: Optional[int], + preview_offset: Optional[int], +) -> Generator[Any, Any, Any]: + """ + Depending on the excel type either it's the new format or the old one, + this method will return an iterator to read on its rows + """ + + if isinstance(wb, xlrd.book.Book): + return _old_xls_rows_iterator(wb, sheet_name, preview_nrows, preview_offset) + + return _new_xls_rows_iterator(wb, sheet_name, preview_nrows, preview_offset) + + +def _build_row_subset( + row: Union[List[Any], Tuple[Any]], + sh_name: str, + sheetnames: List[str], + row_number: int, + row_subset: List[str], +) -> List[str]: + """ + This method will build each row and add an extra row for the sheet_name + If we're in an excel with multiple sheets + + """ + + def _infer_type(cell_value: Any) -> Any: + value = str(cell_value) + if type(cell_value) in [int, float, str]: + # we're removing "," from cells because we're going to be using comma as seperator for our csv payload + # and if we keep some cells with comma, it could generate fake mismatch errors on columns... + value = str(cell_value).replace(",", " ") + elif type(cell_value) == bool: + # we're assuming "True" and "False" will be considered as booleans + value = f'"{cell_value}"' + elif type(cell_value) in [datetime.datetime]: + # in teh context of only preview, i think it's okay to + # just have a representation of the date + value = cell_value.strftime("%m/%d/%Y %H:%M:%S") + + return value + + cells = [ + _infer_type(cell.value) + if type(cell) not in [str, int, float, bool, datetime.datetime] and cell is not None + else _infer_type(cell) + for cell in row + ] + + if len(sheetnames) > 1: + row_subset.append(f'{",".join([*cells, sh_name])}\n') + else: + row_subset.append(f'{",".join(cells)}\n') + + return row_subset + + +def _get_row_subset_per_sheet( + wb: Union[openpyxl.workbook.Workbook, xlrd.book.Book], + sh_name: str, + sheetnames: List[str], + preview_nrows: Optional[int], + preview_offset: Optional[int], + row_subset: List[str], + skiprows: Optional[int] = None, + nrows: Optional[int] = None, + skipfooter: int = 0, +) -> List[str]: + """ + This method will get an iterator from the workbook and + construct a list of row inside row_subset + """ + # we get the row iterator from here + row_iterator = _get_rows_iterator(wb, sh_name, preview_nrows, preview_offset) + + def __loop_and_fill_row_subsets(row_subset: List[str], loop_on: Any) -> List[str]: + headers_skipped = False + for row_number, row in loop_on: + # We want to skip the headers if we're in another sheet + if not headers_skipped: + headers_skipped = True + continue + if skiprows: + if row_number <= skiprows: + continue + row_subset = _build_row_subset(row, sh_name, sheetnames, row_number, row_subset) + if nrows: + if row_number == nrows: + break + + return row_subset + + if isinstance(wb, openpyxl.workbook.Workbook): + for row_iter in row_iterator: + row_subset = __loop_and_fill_row_subsets(row_subset, enumerate(row_iter)) + else: + row_subset = __loop_and_fill_row_subsets(row_subset, enumerate(row_iterator)) + + # to handle the skipfooter + lines_to_keep = len(row_subset) - skipfooter + + return row_subset[:lines_to_keep] + + +def _read_sheets( + wb: Union[openpyxl.workbook.Workbook, xlrd.book.Book], + sheet_names: List[Any], + preview_nrows: Optional[int], + preview_offset: Optional[int], + nrows: Optional[int] = None, + skiprows: Optional[int] = None, + skipfooter: int = 0, +) -> List[Any]: + """ + This method will loop over sheets, read content and return a list of rows + depending on your inputs + + """ + + row_subset: List[str] = [] + + for sh_name in sheet_names: + row_subset = _get_row_subset_per_sheet( + wb, + sh_name, + sheet_names, + preview_nrows, + preview_offset, + row_subset, + skiprows, + nrows, + skipfooter, + ) + + if isinstance(wb, openpyxl.workbook.Workbook): + wb.close() + + return row_subset -@wraps(pd.read_excel) def read_excel( - filepath_or_buffer: "FilePathOrBuffer", + filepath: str, *, - # extra `peakina` reader kwargs - preview_offset: int = 0, preview_nrows: Optional[int] = None, - # change of default values - keep_default_na: bool = False, # pandas default: `True` - **kwargs: Any, + preview_offset: Optional[int] = None, + sheet_name: str = "", + na_values: Any = None, + keep_default_na: bool = False, + skiprows: Optional[int] = None, + nrows: Optional[int] = None, + skipfooter: int = 0, ) -> pd.DataFrame: - df = pd.read_excel( - filepath_or_buffer, - keep_default_na=keep_default_na, - **kwargs, + """ + Uses openpyxl (with xlrd as fallback) to convert the excel sheet into a csv string. + This csv is then read by pandas to make a DataFrame. + + Using this two steps, we are able to obtain better performance than pd.read_excel alone. + Also, these two libraries are able to read only the top of each sheet, + so we can create previews without reading the whole file. + + """ + + column_names = [] + + try: + wb = openpyxl.load_workbook(filepath, read_only=True) + all_sheet_names = wb.sheetnames + + # we get column names with the iterator + for sh_name in all_sheet_names: + for column_list in [list(c) for c in wb[sh_name].iter_rows(min_row=1, max_row=1)]: + for co in column_list: + if co.value not in column_names: + column_names.append(co.value) + + except InvalidFileException as e: + LOGGER.info(f"Failed to read file {filepath} with openpyxl. Trying xlrd.", exc_info=e) + wb = xlrd.open_workbook(filepath) + all_sheet_names = wb.sheet_names() + + for sh_name in all_sheet_names: + column_names += [ + c.value for c in wb.sheet_by_name(sh_name).row(0) if c.value not in column_names + ] + + sheet_names = [sheet_name] if sheet_name else all_sheet_names + if len(all_sheet_names) > 1: + sheet_names = [all_sheet_names[0]] if sheet_name == "" else sheet_names + + row_subset = _read_sheets( + wb, sheet_names, preview_nrows, preview_offset, nrows, skiprows, skipfooter ) - # if there are several sheets, pf.read_excel returns a dict {sheet_name: df} - if isinstance(df, dict): - for sheet_name, sheet_df in df.items(): - sheet_df["__sheet__"] = sheet_name - df = pd.concat(df.values(), sort=False) - if preview_nrows is not None or preview_offset: - offset = None if preview_nrows is None else preview_offset + preview_nrows - return df[preview_offset:offset] - return df + if sheet_name is None: + if "__sheet__" not in column_names: # type: ignore + column_names.append("__sheet__") + + columns_kwargs = { + "header": None, + "names": column_names, + } + return pd.read_csv( + StringIO("\n".join(row_subset)), + nrows=nrows, + na_values=na_values, + keep_default_na=keep_default_na, + true_values=["True"], + false_values=["False"], + **columns_kwargs, + ) -def excel_meta( - filepath_or_buffer: "FilePathOrBuffer", reader_kwargs: Dict[str, Any] -) -> Dict[str, Any]: + +def excel_meta(filepath: str, reader_kwargs: Dict[str, Any]) -> Dict[str, Any]: """ Returns a dictionary with the meta information of the excel file. """ - excel_file = pd.ExcelFile(filepath_or_buffer) - sheet_names = excel_file.sheet_names - df = read_excel(excel_file, **reader_kwargs) + total_rows = 0 + try: + wb = openpyxl.load_workbook(filepath, read_only=True) + for sheet in wb.worksheets: + total_rows += sheet.max_row + sheet_names = wb.sheetnames + except InvalidFileException as e: + LOGGER.info(f"Failed to read file {filepath} with openpyxl. Trying xlrd.", exc_info=e) + wb = xlrd.open_workbook(filepath) + sheet_names = wb.sheet_names() + for sheet in sheet_names: + total_rows += wb.sheet_by_name(sheet).nrows - if (sheet_name := reader_kwargs.get("sheet_name", 0)) is None: - # multiple sheets together - return { - "sheetnames": sheet_names, - "df_rows": df.shape[0], - "total_rows": sum(excel_file.parse(sheet_name).shape[0] for sheet_name in sheet_names), - } - else: - # single sheet - return { - "sheetnames": sheet_names, - "df_rows": df.shape[0], - "total_rows": excel_file.parse(sheet_name).shape[0], - } + # to not count headers of sheets as rows: + total_rows -= len(sheet_names) + + df = read_excel(filepath, **reader_kwargs) + + return { + "sheetnames": sheet_names, + "df_rows": df.shape[0], + "total_rows": total_rows, + } diff --git a/pyproject.toml b/pyproject.toml index 0715b42e..a62ca75d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "peakina" -version = "0.7.2" +version = "0.7.3" description = "pandas readers on steroids (remote files, glob patterns, cache, etc.)" authors = ["Toucan Toco "] readme = "README.md" diff --git a/tests/fixtures/fixture-multi-sheet.xlsx b/tests/fixtures/fixture-multi-sheet.xlsx index 3d81f7dba4ef4b6c0739fa98058184bc88066267..a0bf395c479059b05872221f21af21c5ea00ef7e 100644 GIT binary patch delta 3569 zcmZWsXH*l|5>7%5p#=h=HwlIsdM^q{G1Mp}pcHAM^crv>QWXSI0;os{MUmc{G^q=U zNRtv|L7Ma?NL_iL@4VH0_t%_z=6+}H%-rvO6NO5Wq&Gp6k;4Gg)YO2kV9TrJ^ejN~ zGoum$)dl^u7U=lNepm!5%5!)r37orIlewYqggj%@o`n~#J0B{nylsJWW~whxnYHA3 z=jt3BL2c5(M$QOG5<5AR0ECVYU%Y4J&luG?vo=vXT{4J5u|NW}T}Ad)(~M>$`BdRg zx7SpUx2leD?35(FhfwCU-gdHq&reos{ZPY|5@0DHz)Ot*?J(yL!!I=ovJ5iwgYTtl zzM^5bEO5_#Awb=r#quCgQ9VIKJ7^JF@==&A8+p^nL)*q8l9M5uT8naVCGqlPcGEOr zvrO}jUiG{T5jPvC+_;nnwYlVrs6zG%!p*eY)8@_mc7;BOXhc~?4f~2~fXX80GyysBS?xP2p3uhui8gN%)|hD7Hqc%|H+VIoyf#lpUB zuIwCI-%v|ypGsQYoem_T%WXj|sr-{|O0$`csp*85Jdt0S-BuzhDmVb`!tuy|?D z{6Xi=_KlGqJh@pt!rXVvP-xS+MN`fSom0K3JoT8{etKh{)}8T99=s_g^93=k?X8^1ARsRGL%Xt$1`fe)OS^^KewS!7#70viLd5vmd^0Bd8?zmEU%|_^AKM5 z7Mfqu7tbx5ajmZQdXPT}vBRw}?$UEW4HbJ|1 zsj#IgxL*fSP*whZySYSu9k+NMGm)9|tX|@$8k93vo zMKGjHYP77{nxLtuSJrbI4#@xjH5vfmcN#fm3K9G%MJUoXZIt$J_79C5mZj@stkXxZ zo&iGfGXA8#9Ub(ePB5ZC(CN>ET}WbLk){6myhA=TfLc}cE}si8FJyh~f{4&tC9p1V zi)f(z_U&5Q9}Tw68s{?xs|TFxDhK(`TjdGRSZ(JN(F$G3|iD3E|BnF>n3HeK7~{wU}H9;&5LKxshC(g;L?w zl{KjlLd(@bwU?xsrHhR|seQJJ?E9jY0#QuQco5k)iFB2Xo2ZO=te_{*+~mN-Vq?HN zs>hwp<p{b2evsS!QzC_mGT)c zXxU}W_5>Y$w}l&zRi#)*rd1-EmYeQ~z>g4Zg-T}blA^~J6+6+%o7WD_R^j2c=b&R) zL%+g(aJQxCBe&=0+Bm!YYORLeb0`XO41RxYR;;9h<&BM>PLF^|Tlu<6+9~qmtqj7a1NcwP^~M&7hx-Ee&M)?#B8m8rE2NAWl< zCqGYI_dcD$V>j05so2!`AtE2or6qc<)K2Wlg!t|-${ITCPa*U0Xv5t-rN*Ibn2x!- zjUi>3%R4>3>&rk8o1sc-8FCbFhtyre$zJSaHqT*^JBH})(5>E`1wme++NmWpPT23_ zqH{%#WOsb5gs;5ma>a;>$5LX?y~Czz9^AObSw&3nWiotB41d<+*wx=^4n9y{P^=cL zpZ1u-7|uR}-7&VZhEIG}r3~cKCS&V+J>TI!1R(KU?48h-X^Lu!vGK+iYc5I9JAdz+ zzAdKCAl}y3rrXcGz4$>OL9C>_)6DAH0C*cN7QQ*-v^hH!p-{YMy;Hi|QMmah>$Nd5 z1@$nEs{7%KEbHg-9N!%Q9zG!a&9KcJ=KWAlva5;7!Di#W1Mj>UXR!fk0~4*o>w-93 z)fxfPlcAk2Y&&fasi?*0Y;_T_(pUG_$qzN>Q|#?LbnlGC1wXm_YV}0tW^Qlu!nd>w zu{uMNsthkD6L8NMN%`5XD_qbK7Nd_ebE#_eF~neIRR@f#OrGb;4F@=LAgm^*{9rYj ze=BYxxrz@{j@s}=T09jPF_G@=w?QrBGGWkfy`nu84SCd)mlE)4OY57vo6-X+Jxpnf z5Ywl3Vx{ky##C6fZo{rmT;m?MU}+b3ly=3cUuH6uDoelXsQ6b(iU7(98=Z`-rt|6; zjplogHqQC*%?>394Q=-kploB|Hl=T^W;1~H3wmzZ0V=aZ~+P8 zjobNPA$u0GjkQ(7V$oICvz?74q{idA6e1hw#ghQmwRn_!?d#5_$*-;TpCFyb#MGM% zBR%F%kZ24nY@ztkYl(r0V5kPMJLo0FZ7Vv2a=6yl<&x>>?UA-ZcREW{>m`0YmA8g5 zHGgr&md{$7^defbBI;a=Z>V3ec4*(nGZX>|vg!5|8Q=x>butGTMD~^3=_|S8EkcEcF7Gvb|w;Omk)Eoo{v;2)-xCxb(O3nO7Iqa{?7j0vbiYQ)FJ=K z%=>~Fzv4XWQ%`rZX*A<4;6rQIKP(TBHc+5~!DvHSCXL^54Tl)>N6%*aK2$p=-d*ti zpAOx1t?BX?KgK+0qxR$47i;=r{9%QnK<~0%TH@1BrIqZ6Naq8WsU2mzMK^oYQ7L9# zYY6ekky)3|Eru)GuSMhwmSDdsn@nM3v7Nnr2wnf+(3n+i7 zc+r5=knzbvPWX*}X_d;(TnJ8hG*=Iq#?+;<*u%~=zOC|&HRSGn4$Nb)#O~aEHu)r> zAaYSmvipXnb#Dqztu}RPUlflUxP9xG^DH;1<-6j`r#BBi&Cmb7v!Ak-P{BDmjt{n1|&?|K=bkG95a1TKK0uS(j#K4OFt^$vR*1@bDI4rin9s;K`_8S zlZi9o2qycJ9e{8S5Ic~-4d=E#8&%4p?inqg_Omi806_Ok7Xtu}r{VnZxmS+$1pH;y`FlOQ5{So6YnPdLP`dW+tcBnXebh$unuh!DL+?;@hD zURR5#AyJlSKa%&INuD$FojHG8_c`CW?)#eaNfZhdq|`wWg6IGwBqV@R-$1Q0N_rsZ za)W81}o%JLO;;3)w#H;Cg$BVwIRQax0Rxs7v9X#UI=lpGE z=!0ncI5%8bH9bSDj>|4Rjs%{fupW4?dTaL$EPnScm8A&8!Cggltd9kq7Tr0C@X%$m zHNZT>dnp$@BjjCJG~bwyvO%^HOKuljS0z2PnnxqD55y3(D&LoNsU-L|!D?qO3Ukf{ zJ%n~DGLLNsZtpf-y`EsP)y4DHg!Mi+=HvGsAe>14@O2A^cev?*8WOKudd zMONP>%^F3^P1)zaa={0<3gFuugY-Q;02@|0SCkUyl(T$qg@<)H=SmQNn)~zz*Ewiv`^0)Sc}}fsrEi4-B(A8xQ_eL zsN#ouh9;_=uq%!5naugvFB|Wm@Z>sUdSd;4Q2q?TZZtpY5c2$C@DdUNMc#QKuG9h_ z9=)@f4`OjaHd^P_^^Jc%vi8YAzXyHyAiM5eyUX=f>WS{P5K6*Fx%iVK>qdF&@j-OlfNdmf{lvx|7Lfbj2CVCWSWOAaJyu1XE5J%YseR(g;qh`i9E6#69n@-e$6#Q>OyHYxA^YUDGn4fFr6np z%wAzGpAjk|-mZNZ-rlM?6<;uF5h^CF&<9v~hK9mX<0fsxtBjt8yl$cPSv#_I&_l*b z2x02By_F9|5{rRx-E43$mLRRz*c-|AX(fA-SD4FOXGqJi8@Q0jy6B(Vz=rMN70R<` z>7ae?dJNNDTM24A`C6j3_g25+W`TS>7WQh-<>bd#UGA^&K%I>7Jg6tix^SfN%7=ys z^sNM3=x|NmHKx{yuMdJO+!KTE1Kx6iuptzu7xQo#h7Iq9I& zD5`u>teZ+yuIkFXh%DI(F!r6XQfXq#W!Vi@jW5$;HIXzh~(a=+r z_oCWkP_&y2x5ln2CqF%6&u9>HKQuP<%a=8I=9zubQR$eLwrS+lbc)LY3%=RdT2~lX z)JXhBslqq4NquE0Tq#NKHo5gKyQIkGLystiy)e13dULLC`fg4|yIE<)lRrCQ69H&a zY?E{yAi`v->HwA^^|2nZ45@j6hL13NuRcl!+a*<}y6@$`&-!xe-_tZY6GMgQ5X-3b zh;XjbkM@5PCHgR;tiNSlvVOrf+VW8~Gh)k2@g#PzT?!!p;6w7?h+|$7@uMZk8!y7UI(UCP zAxTR8zEQ@Qk&6_j>y^Z8|DMV}ah;^f?_k#ANBdy)K+c2gaIe5pP3jK+fM7uWHd57ax6n$$^Mj90Q|XKb#UQ$v&2Ji^qBxqGZ^^%b$Fie zzB1if!}G}1h$4k~Ej0TBWpaVh+t@0CyR+Gms_M*{D)bf{do%(gB^`0ft?spnzV}0f z?z|w-BFBz%7J8=SQ7$0C;3c9FlzP0ELM_ouVGqNYdDDn9Zl>2*%wt+NS!HX1FOQt2 zh&SY^C2LTMS%79y_>;rfRW>a#Ux&B=Z+pPB^d8OB6~T=NgjF&m0lk^v(0L*xp`g0hAtk_jenjzkJZWAyq`a2B>?hqh+Z%-?QK{vK zyFEN$H@YzkC(5 z0pYIr_O7CWRXix{xQjkTyRdhkQy7Lwnj!w=wzt|u;3scj!i^c81k!KMC8eSK<+tZ! zd_1q99)Y)LpP`&Gr>)%}TYRl!OH0|^mG~ENp4zfUnFZW26A?GK%&2!qsALm-ejb32 zZ4;lP9|~I_s~k=gA~NW>#|Euxh@EO`%#XKkDf1q{LfHdChCw55i(O%mWT>-JP*?z$ zeJ^!N{T#D)fB%4uQ#)(&#L$dd1}Ey2*tlN>lK5jnYmKOlIx7wvHKr@qoC-;^p|J0J zebW0zm7>mEgK0X%^`Hk^1}fp~pe{jqxkS zv$Fj`EHm>a%@}%`l5O*Uo|zuq`zn6Vk^bKy>W;2meaa(?&?3Sw;Bm+ak zk^rg{3LSZ%_4+G*jsBGu(PY5mR_=+yK&^G^P55K8k*wPMuBTlZcIk^iInz!)#w%t( zCaUn4n%7w5!^M;+MS9TA*~`6LFmH#3xpj_onD}%PPtsLqryF1@Lz}WsQf`zGG^iHa z8B*`@HHxEr>w8t@*`hzVu#gZ~U|H^ii}tA)q_Dg4!neas+DKkxNkGOmlwy;U%cAnD zjHyN_jVP)JrKTA>(5Tp-m`4}Pj0mMB6ZQfs+sD(I#JQ$c3!y3d8F8rS)vNU56uWS5 z9T{-A_SCslSq-Jnfk#WUpm?|}H{89N z8E{x?mxX^hnDAOCOYVR0Jsjs6f&~@68%YOG=GVV2r!O}(BGY`T@;jG;2q?z!f_BrfQlJ&#_(JC=Oc0LFX0gwlg_~P>$E>7?QhXIDF(^F z_4_x@=^+FY6c{E3re8(H(&+<~7uQ!`j40J#C!bPd=IB|VmoEJ2)z!a5YM50K*x5jFq;fCK;l$N;h;uo&7f0KkVw000&M3HG(9 zl_kW`5~8i*WNiq3%j{@jPMY-ymNo+b3%&op_CLG=#W8a7?X2j27m{;iC%9CWnxwu- zkk%ngdZnj2D^mIs6-+~wl{LmI)kvA6-ZNoL-DwxijTHjV%J`C*$g+-0vD@`X*{rK3 zy9T6>x!n4!#Qk$p)V}f4c0N-KE$R;$fe-`OOYJp+<7nrkg6xn}+u9JV8^sXHL#~|H*^kFpWt|NR*_T3%H(_0 zuDuTMW*8$dm;1udg>G9W+H%^^T7BS`9rQ|+ODQraz3MwW8de(l8-_?YtAlStq99&68br|9&{OZ46^>L5hoI(V9U zw(oM^GVZsU}Gu&AY}hqNXrJbE@D4E~sR(x>H``h_ry# z9G3O~i&YLpY)v#1G*FegzlQ@T{*9Mas;m^>p);~j+95-Escma$4rXEgt^bdY|HCBw z%c~bh$tkq6qWB+5-UReqOw4~ke<|Z2BH2Ku;_fLmhgKDqO-VT4LPLbE^2`TL!lTLk zrf+s$AZ(|H?0kizI0O@$pQ75K#6SMp#txC5+BWW$P4RLEhW*6Z#Cf8Gv=gm;Qv`ic zZ9%3??;^#kv173c^dV;TXOFRRiTrT|6TP&$0OI>@+$u5C08A!@Q!Mb`9Bc!wV9nAoEdQ-ps1rt2G0w$^7h z!#YN%z(vQfGHUIXH}>MKxJ4lP^Aw+1rZdH&rSs*?M+o^fkFv0o{Ul?G4f9w4GYe5d$LW~K zquJ<{`OkUKdaH0eaHs=Z)cWoz89XA^fnR8^tT>+JPE1GxT7OU`)ogAo^%_#!-jy@G z?I_%d!=YB@eZ^aJCV-QaPe#%Pvb@x7@08@6D8}HGB#z(_HCH%acgKCLe2Pc~Y6A?m5fUx0#53i$`FTXMu9^d3msiItVselRPI=HY^s}7;MOl8Z zx+g?jh#bkeSuZdu{h;kru_WbU<~jXVMtjUU1Idzz+~M@?ASOO<23KgoEn;k+6cdRu zbVxw5m{ReX`%xqydyAJ4L0CsScKU|Kh2&9on0*lcGDPuGrJWBg-?Zk-SaS}`0yak{tY){p#EF_WEN)$ro@0F=rpbGp?755T~i9dzT)I-($82||es^Y)9 zg1;*IpKbvL8m2*s^1t1RL2@!3tmsY1cLA&p$@VxH(-0Q&J=I+_q~1!pS!yzN_wS2@ z^!0BwzR9q_S@}C14s_dJa6Vmu!#ZnaCg}Jx0 zTeOe-GzpAVCluAgn1;=>aY;)>5RjcmF7~}{)Stt2kkyXO6g*7a2Q`cRm1~~YzRG(g z-jy{qq)Bu}>-Tk{v*!ru-cG=gQ-a4*0;o=ak*#quhRas*bEK2T8zVWVL9^EEYm{}O zA{mlGPC-|H2x&l%1ki&n3cQ|`*r0m240-vZkFI6*gt7noq0w4z_rxM}r~j!k-gcg0 zhfsAHgIYy==pBDpMeut=LkO7V=kWYD^Y|0|vrCD7Yc)%X?SDwKFHCxX6}k%v!^xjf zQ{A1fJXlRyTE+K6L$i)Do$*+FJ}8m9M_y>y{`^RYrJIQ5;Nn#@(`80V32nMjjAv}A zn*G}m;3AO5A;y>1oT>7&;s_qnclWEidA-dW>PkQ)X{^>ceFeUWM%v4r}kjDsA5 zgc!#YWtv+8fqX%sMow|_Y!orM6q(;AS>KG&K1P#2M=&QMBdH9my&o=QCNg>T&WF9t z>e`Y#bsN0`q}3V!D7T~M#Ka;Mk0H$I(20z4+NBZfY8lRDa?{pJyZzQ$E`>ZX%%xe@ zV{k*Z2Wy+{zKFF^host+=L$fSI=-Flld& zX!CA#Jxk58O;oKzRCy$#(Uq3=H8be;Mx`9NOb@ejhk&C8X{(ivpbxx0XQXlID@uSV zss)XaLrsy>MRtST>C4+ZXwXnbJX49@bxYKAs|;%ke$2#Z3SXQ%KrXN&9I@x>$sMc3 zeXWOm2TG{N_%VK{Li{>(Sn-^U@6?QF32~bC%QWSeJJO|7lgJ^VxntLJVJeyM`(Jw( z8m08a2O0&GkqdfGRPJz@>Q034V4PQ?G8|l+X%)R@5RbEVXN>DP?bK5nl}(|PGi zqc3M0wWWSBym8nTge~Z=E?e;`{zV-?l5Ika&^#_N<1==pcA$~MT8mf%jSf}8h-T{7 zgg~r3POmBJbG!d*@ZXz~`=c|2{aJT89!eLng)s~A?fzUk|49{m3IH~T~< zDXVZM$0$AYl|ijXN{*pIm2H?^9YotFJLFj*bAtROD^L1~)D;!gRUPZQSEhQ>rWk?7 z#W^41(zRh%w`Qle;y3XO@r-lw;--Exi2o!>D;|2YJR$(lMf%&h_a{jp?+q;sS$_6E z`LU-t5JJd}-GX;5i~@GJ;arWNT38w~kDjMANR1(?s@_vlV`qzMB*cQH;ynY==H$k| zv=WRNeF2Bs^gZG+W#u+-gl1rhqGX*i*<4!79yLVbt|>qNBF)|T6mmS;l;A9pfZP@p zr`@c0BA607l|VTb=V8V}%i*dZ854?O_}KCanej$uHr?G8!=e|ddkXGK%sev=D8)NV zkK2jlA6fgwh{tP@I<#A|jUpYln~2zFApw^m7gT~z5q0^_TTju&jg5?A4>eZW!{uY6 z!t(RTm#Cq%i|U0HRGAg4?Vnx-`fY&@h{h zYxzMJRq=WvvqAk(wI6q3So`hgI^x0zzsU?0k8*z?O8t^Ja+wI7(3|#vRHJ<3-N_7W z?bq#-GW+ioYrdN)ib>hl`ZN1gnL5Da&e*bFXWK z9j zy~SJ5Zn-cPbJKI!`acdtoz+U&~r+h&P0y5JiHTFj>F zfyZ@};fZChD&XhzL+$Y$_?8Nj`j3vR(`4tPPwRpjf`%i_4w$8j0T-L>8R{#b>9BIc z4b))Fpgf+|QRS8j(NN-|2GPLvNkO(1+47+82k2!!fn!vnwmMWG=1Nh$!2UKvQs0@A zfdz}Q_5!mBL01C`UFdVLx(9w-@9KZgth5K>6?@J}+#F)C#%Al$DnVV!LMi4POm}*% z&*&?&!547$Or+%r%Ieb}ByT&oqGW8hM(aZG*&_w*r@|Bkz%#N@Jss}UPszk19ekwl zTqCCW#RNr(6hA$mt9e}Ac56gl3%_QX6hjkuZ8%6=(HNT81jK`Qmxmb62vkp14Fs|C z^PPT-ooT>VhJ<$EyVMAQHHLNB`#0-G)X8;XiT9+78F~g)&r)6LuZA7BWUR3BSb})zBif8-}E5 z9Pf;K;QH2@!QJX}w~qFL;C}NX0#{Q4(({`00dyrZZvkC@q$enN^WK3tcW=Z}i?qxS zVv?tlxzOOSU#T@=MX!Z*4h4OpxpC+taex6Mz(>43{t?5MOl!C)=JomU1(4Cq`F+~l z7wf2E@HZz_@Vkl2`yYiyWAB8@3AWp1!NY};@m=>V6$j0D z1ls8_d|g>1LM1zVCGDp#vELc{zMD?e7=dVvDDK2(YQ3^qf5K{kgX6eFwa++E%YHH=-4t~q4DpRBydI}D+E37U2 z_DM&CI*Revcv;G|_5@OCmX{8?aRhju0>onY-bBmyAv}*vaEa+)(q{6AMdcVF%CcZk z_e2-cT1|Sv08F+)hw%3}vb9o}_|M}jrO5-mmX%Ab?+QB?iHj+xiQ1ZgVG&Jme4E@D zIT{))42a=Do*_QAI|I+XlFM--*RLE_bo$;;HU^LK#)MPwg(%u*B^ac&aR!xs9Z_Ha z;wwxiwPx-z3nUc6oi{1Wa|0*(zz8NA#(ACeJL8b(?EwBY)S?s7@lW)*mbdauwxwpw zbx$!ToTM9<$K68UT3MOWzp}|`Hp;gLfBvjATglFdgrYFcb!lPuibwFclt#EG^{bZM z3&vgP)XVF>xyA`2LvsUK*x*cRF*# zdFd&pm}j~z=qf&-OpNLxKYM(&{@e05J9hMFOFEHCZ=R0rUcD+(DeS0w>}KoiK9!*I--6- z5}R9?FqgM=-#w>ITE;1+=F1?Z49Q?}PH%FHuH5*t-5w5Fh4D34u4K>C)SZjtrzsqFBrcHIVdrw)t3w>Y1KKD-SzjfS-U; zsgPyvXwK4$DrDoF@AcND6lG)j=*EH}6dE5@i%h|AwHgzg4BNc(RPXdaX&(Src9OcG zK3kdxsqR#xC}!6k#jdn6rc`pfv8F6?!A3r0GBbw@Sj(hVf!1*`!3_HZijRZW7^9G8 z(rYPFaFV9=IWa7H)lkv7^7{R1d{HgEjmx z4fD;R4U6uMMP!x$+FW{PtwsXg;|8rtMU<{9CBL3RfIp*`J*P@P5!XVwy8uUqmv8dK z=KkRA@^`gpQT$FU#64IesZ0nwx6}CIs{=XBHWLd{;6OG14-*x*dt^fth)bMGsCg{q z+zzj^tl%=dXMY;`XPet&wp)dwD=0#5s8;vnB@kdVk;>OEsNW?|Hwo^h9oE4q=}k-SUX7>Y zD-j+G-iN0YPPvW{q+2BgL9{ON(Eey$He4(0 ziChUOrnyJ7uUsdcmqJVOC+&ZYa(SFSj1@y?mY^3l^nxA*!S8i#4GokbwkDRwKiOsw z-Wm-199`@X`A&4J!AJ#NGs+SD2KLsIXN=D)$0g+ekADkThX1!1o_M8=J^*T7aG{0-1DYE#u+mqwwXz1Y=v&zu{!zL5FC_xq_E3D!7FwcpB8M zw|$YBjKU$gS1uu=6juZysq)Q?0CLHG4f@0lIcgjeIQUrOFUGtx`5t$f6{RRMt!wE@ z#>vA4%w3X1$ZrHsJ5%NFk774fx_;bSR(pSn=LK?tk$v$(oS9M&hUk=?1OX3Xuz_L| z(S_9NxH2T;W(=?9KtIZKzmY9+yZ?b`VNwJ4s52Dr!y}0t-9E+vf4)`hq?l>K(}ue{ zeHW&g!h*uag43)ST_Y;BtvA~mfsF6# zm1JBg9dh0a(;yxAjBZoa{C=i=Sm6OCzYRHrZcZ>%gVF!0y}H)c|0yqYVg6_-AAXfN zXfHSkB(ai4Wmje1Q1gV&XOHibYw=6Bjo-PTV8Pq08U3gNwp34tin7Rmuq7>C|1=5$ zIGe)SSGmp$Gh62Lbn31;zbsgPq-nXg?&l58QfUtx)X!OTv@%yAxeuaW3-#t}Y=*^-Jjn zWFe)z>cD^zQI@m{G^AIz2haHfN_`B%lCt@qWwi&%fWLQKc8pooFvB5x*iw@WmR0qu z=9X36&78bt-`Na0hZ?cp(-bhUOi*L}?*;ll7v<0KU&{50GJhBF_e$cQz~9CM=wJS& zuJ{o6u)6XK+62ukKB%=k1phs6{R;{JWI#i{|9=+yp`3@w%wLkmkpJI9{3EsbP|Cw> z&o3#LPk%~znEQDs;9e=L{O#NS0oW-Y`v3p{ literal 0 HcmV?d00001 diff --git a/tests/readers/test_excel.py b/tests/readers/test_excel.py index bd961623..0f66ceb8 100644 --- a/tests/readers/test_excel.py +++ b/tests/readers/test_excel.py @@ -25,8 +25,8 @@ def test_simple_xls_preview(path): "moins souvent", "jamais", ], - "part": [9, 45, 35, 10, 1], - "clients": [896] * 5, + "part": [9.0, 45.0, 35.0, 10.0, 1.0], + "clients": [896.0] * 5, "pays": ["France"] * 5, } ) @@ -44,8 +44,8 @@ def test_simple_xls_preview(path): "breakdown": ["Par territoire"] * 2, "catégorie": ["Agglo 1 2014"] * 2, "fréquence": ["Au moins 1 fois/mois", "plusieurs fois/an"], - "part": [9, 45], - "clients": [896] * 2, + "part": [9.0, 45.0], + "clients": [896.0] * 2, "pays": ["France"] * 2, } ) @@ -63,8 +63,8 @@ def test_simple_xls_preview(path): "breakdown": ["Par territoire"] * 2, "catégorie": ["Agglo 1 2014"] * 2, "fréquence": ["1 fois/an", "moins souvent"], - "part": [35, 10], - "clients": [896] * 2, + "part": [35.0, 10.0], + "clients": [896.0] * 2, "pays": ["France"] * 2, } ) @@ -118,27 +118,66 @@ def test_xls_metadata(path): "total_rows": 170, } + +def test_multiple_xls_metadata(path): + """It should be able to get metadata of an excel file with multiple sheets""" # with multiple sheets ds = DataSource( path("fixture-multi-sheet.xlsx"), reader_kwargs={"sheet_name": None, "preview_nrows": 1, "preview_offset": 1}, ) + # because our excel file has 1 entry on January sheet and 3 entries in February sheet assert ds.get_df().shape == (1, 3) assert ds.get_metadata() == { "sheetnames": ["January", "February"], "df_rows": 1, - "total_rows": 2, + "total_rows": 4, } ds = DataSource( path("fixture-multi-sheet.xlsx"), reader_kwargs={"sheet_name": None, "preview_nrows": 2, "preview_offset": 2}, ) - assert ds.get_df().shape == (0, 3) + # because our excel file has 1 entry on January sheet and 3 entries in February sheet + assert ds.get_df().shape == (1, 3) + assert ds.get_metadata() == { + "sheetnames": ["January", "February"], + "df_rows": 1, + "total_rows": 4, + } + + ds = DataSource( + path("fixture-multi-sheet.xlsx"), + reader_kwargs={"sheet_name": None, "preview_nrows": 2}, + ) + # because our excel file has 1 entry on January sheet and 3 entries in February sheet + # the result is 3 lines here because we're previewing 2 rows from January's sheet (which is 1 as result) and + # 2 rows from February's sheet (which is 2 as result) + # 1 + 2 => 3 lines/rows + assert ds.get_df().shape == (3, 3) + assert ds.get_metadata() == { + "sheetnames": ["January", "February"], + "df_rows": 3, + "total_rows": 4, + } + + ds = DataSource( + path("fixture-multi-sheet.xlsx"), + reader_kwargs={"sheet_name": None, "preview_offset": 2}, + ) + # because our excel file has 1 entry on January sheet and 3 entries in February sheet + # the result is 0 lines/rows here because we're previewing an offset of 2 on available + # rows from January's sheet (1 row) (as result we have 0 from this sheet) and an offset of 2 + # on February's sheet rows (3rows) (as result we have 1 from this sheet) + # 0 + 1 => 1 lines/rows (the line from February sheet) + assert ds.get_df().shape == (1, 3) + assert ds.get_df().equals( + pd.DataFrame({"Month": [4], "Year": [2022], "__sheet__": ["February"]}) + ) assert ds.get_metadata() == { "sheetnames": ["January", "February"], - "df_rows": 0, - "total_rows": 2, + "df_rows": 1, + "total_rows": 4, } @@ -148,12 +187,13 @@ def test_multisheet_xlsx(path): path("fixture-multi-sheet.xlsx"), reader_kwargs={"sheet_name": None}, ) + # because our excel file has 1 entry on January sheet and 3 entries in February sheet assert ds.get_df().equals( pd.DataFrame( { - "Month": [1, 2], - "Year": [2019, 2019], - "__sheet__": ["January", "February"], + "Month": [1, 2, 3, 4], + "Year": [2019, 2019, 2021, 2022], + "__sheet__": ["January", "February", "February", "February"], } ) ) @@ -175,11 +215,29 @@ def test_multisheet_xlsx(path): path("fixture-multi-sheet.xlsx"), reader_kwargs={"sheet_name": "February"}, ) + # because our excel file has 1 entry on January sheet and 3 entries in February sheet assert ds.get_df().equals( pd.DataFrame( { - "Month": [2], - "Year": [2019], + "Month": [2, 3, 4], + "Year": [2019, 2021, 2022], + } + ) + ) + + +def test_with_specials_types_xlsx(path): + """It should be able to read sheet and format types""" + ds = DataSource( + path("fixture-single-sheet-with-types.xlsx"), + ) + assert ds.get_df().equals( + pd.DataFrame( + { + None: [0, 1, 2], + "bools": [True, False, True], + "dates": ["03/02/2022 05:43:04", "03/02/2022 05:43:04", "03/02/2022 05:43:04"], + "floats": [12.35, 42.69, 1234567.0], } ) ) diff --git a/tests/test_datasource.py b/tests/test_datasource.py index 04b1a44e..1867e1cd 100644 --- a/tests/test_datasource.py +++ b/tests/test_datasource.py @@ -152,22 +152,69 @@ def test_basic_excel(path): assert ds.get_metadata() == { "df_rows": 1, "sheetnames": ["January", "February"], - "total_rows": 1, + "total_rows": 4, # we have for rows as total here because january sheet has 1 row and February sheet has 3 (1 + 3) } # On match datasources, no metadata is returned: assert DataSource(path("fixture-multi-sh*t.xlsx"), match=MatchEnum.GLOB).get_metadata() == {} + # test with skiprows + ds = DataSource(path("fixture-single-sheet.xlsx"), reader_kwargs={"skiprows": 2}) + assert ds.get_df().shape == (0, 2) + + # test with nrows and skiprows + ds = DataSource(path("fixture-single-sheet.xlsx"), reader_kwargs={"nrows": 1, "skiprows": 2}) + assert ds.get_df().shape == (0, 2) + + # test with skiprows and limit offset + ds = DataSource( + path("fixture-single-sheet.xlsx"), + reader_kwargs={"skiprows": 2, "preview_nrows": 1, "preview_offset": 0}, + ) + assert ds.get_df().shape == (0, 2) + + # test with nrows and limit offset + ds = DataSource( + path("fixture-single-sheet.xlsx"), + reader_kwargs={"nrows": 1, "preview_nrows": 1, "preview_offset": 0}, + ) + assert ds.get_df().shape == (1, 2) + + # test with the new file format type + ds = DataSource( + path("fixture_new_format.xls"), reader_kwargs={"preview_nrows": 1, "preview_offset": 2} + ) + assert ds.get_df().shape == (1, 8) + + # test with nrows + ds = DataSource(path("fixture_new_format.xls"), reader_kwargs={"nrows": 2}) + assert ds.get_df().shape == (2, 8) + + # test with skiprows + ds = DataSource(path("fixture_new_format.xls"), reader_kwargs={"skiprows": 2}) + assert ds.get_df().shape == (7, 8) + + # test with nrows and skiprows + ds = DataSource(path("fixture_new_format.xls"), reader_kwargs={"nrows": 1, "skiprows": 2}) + assert ds.get_df().shape == (1, 8) + def test_multi_sheets_excel(path): """It should add a __sheet__ column when retrieving multiple sheet""" ds = DataSource(path("fixture-multi-sheet.xlsx"), reader_kwargs={"sheet_name": None}) - df = pd.DataFrame({"Month": [1, 2], "Year": [2019, 2019], "__sheet__": ["January", "February"]}) + # because our excel file has 1 entry on January sheet and 3 entries in February sheet + df = pd.DataFrame( + { + "Month": [1, 2, 3, 4], + "Year": [2019, 2019, 2021, 2022], + "__sheet__": ["January", "February", "February", "February"], + } + ) assert ds.get_df().equals(df) assert ds.get_metadata() == { - "df_rows": 2, + "df_rows": 4, "sheetnames": ["January", "February"], - "total_rows": 2, + "total_rows": 4, }