diff --git a/peakina/helpers.py b/peakina/helpers.py index ae1d6c2f..67561fe1 100644 --- a/peakina/helpers.py +++ b/peakina/helpers.py @@ -26,6 +26,7 @@ read_excel, read_geo_data, read_json, + read_parquet, read_xml, ) @@ -72,7 +73,7 @@ class TypeInfos(NamedTuple): read_json, ["filter"], # this option comes from read_json, which @wraps(pd.read_json) ), - "parquet": TypeInfos(["peakina/parquet"], pd.read_parquet), + "parquet": TypeInfos(["peakina/parquet"], read_parquet), "xml": TypeInfos(["application/xml", "text/xml"], read_xml), } diff --git a/peakina/readers/__init__.py b/peakina/readers/__init__.py index 9fb2d04e..58873738 100644 --- a/peakina/readers/__init__.py +++ b/peakina/readers/__init__.py @@ -2,6 +2,7 @@ from .excel import excel_meta, read_excel from .geodata import read_geo_data from .json import read_json +from .parquet import read_parquet from .xml import read_xml __all__ = ( @@ -17,4 +18,6 @@ "read_xml", # GEOJSON "read_geo_data", + # PARQUET + "read_parquet", ) diff --git a/peakina/readers/parquet.py b/peakina/readers/parquet.py new file mode 100644 index 00000000..071ac85c --- /dev/null +++ b/peakina/readers/parquet.py @@ -0,0 +1,36 @@ +""" +Module to enhance pandas.read_json with JQ filter +""" + +from typing import TYPE_CHECKING, Any + +import pandas as pd +import pyarrow.dataset as ds + +if TYPE_CHECKING: + from os import PathLike + + FilePathOrBuffer = str | bytes | PathLike[str] | PathLike[bytes] + + +def read_parquet( + path_or_buf: "FilePathOrBuffer", + preview_offset: int = 0, + preview_nrows: int | None = None, + columns: list[int] | None = None, + **kwargs: Any, +) -> pd.DataFrame: + dataset = ds.dataset(source=path_or_buf, format="parquet") + indices = None + + if preview_nrows is not None: + indices = range(preview_offset, preview_offset + preview_nrows) + elif preview_offset > 0: + indices = range(preview_offset, dataset.count_rows()) + + if indices is not None: + table = dataset.take(indices=indices, columns=columns) + else: + table = dataset.to_table(columns=columns) + + return table.to_pandas() diff --git a/tests/fixtures/fixture.parquet b/tests/fixtures/fixture.parquet new file mode 100644 index 00000000..d89bfb30 Binary files /dev/null and b/tests/fixtures/fixture.parquet differ diff --git a/tests/readers/test_parquet.py b/tests/readers/test_parquet.py new file mode 100644 index 00000000..d16ef86b --- /dev/null +++ b/tests/readers/test_parquet.py @@ -0,0 +1,36 @@ +import pandas as pd + +from peakina import DataSource + + +def test_simple_parquet_preview(path): + """It should be able to get a preview of a parquet file""" + ds = DataSource(path("fixture.parquet"), reader_kwargs={"columns": ["Date", "Country"]}) + assert ds.get_df().shape == (4900, 2) + + # preview with only `nrows` + ds = DataSource( + path("fixture.parquet"), + reader_kwargs={"preview_nrows": 2, "columns": ["Date", "Country"]}, + ) + assert ds.get_df().shape == (2, 2) + assert ds.get_df().equals( + pd.DataFrame({"Date": ["29/01/1900", "31/07/1900"], "Country": ["Australia", "Croatia"]}) + ) + + # preview with `offset` and `nrows` + ds = DataSource( + path("fixture.parquet"), + reader_kwargs={"preview_nrows": 2, "preview_offset": 2, "columns": ["Date", "Country"]}, + ) + assert ds.get_df().shape == (2, 2) + assert ds.get_df().equals( + pd.DataFrame({"Date": ["21/08/1900", None], "Country": ["Usa", "Usa"]}) + ) + + # preview with only `offset` + ds = DataSource( + path("fixture.parquet"), + reader_kwargs={"preview_offset": 2, "columns": ["Date", "Country"]}, + ) + assert ds.get_df().shape == (4898, 2)