From 81ded2557a0da17c498bab9aa40a1e5d4d7b7251 Mon Sep 17 00:00:00 2001 From: darker Date: Thu, 3 Mar 2022 13:21:57 +0100 Subject: [PATCH] fix: fix read_json for edge cases between nrows, lines=True and filter None (#99) --- peakina/readers/json.py | 30 ++++++++++++++++++------------ pyproject.toml | 2 +- tests/test_datasource.py | 7 ++++++- 3 files changed, 25 insertions(+), 14 deletions(-) diff --git a/peakina/readers/json.py b/peakina/readers/json.py index 496a817d..72fa2730 100644 --- a/peakina/readers/json.py +++ b/peakina/readers/json.py @@ -29,17 +29,23 @@ def read_json( *args: Any, **kwargs: Any, ) -> pd.DataFrame: - if filter is not None: - with open(path_or_buf, encoding=encoding) as f: - path_or_buf = transform_with_jq(f.read(), filter) - - # for the preview_nrows and the preview_offset, we're going to convert in to list here - if preview_nrows is not None: - # In case we don't have the native nrows given in kwargs, we're going - # to use the provided preview_nrows - if (nrows := kwargs.get("nrows")) is None: - nrows = preview_nrows - - kwargs["nrows"] = nrows + if filter is None: + filter = "." 
+ + with open(path_or_buf, encoding=encoding) as f: + path_or_buf = transform_with_jq(f.read(), filter) + + # In case we don't have the native nrows given in kwargs, we're going + # to use the provided preview_nrows + if (nrows := kwargs.get("nrows", preview_nrows)) is not None: + if kwargs.get("lines") and kwargs.get("lines") is True: + # cf: https://github.com/pandas-dev/pandas/blob/main/pandas/io/json/_json.py#L671 + kwargs["nrows"] = nrows + else: + data = json.loads(path_or_buf) + if isinstance(data, list): + path_or_buf = json.dumps( + data[preview_offset : nrows + preview_offset] + ) # pragma: no cover return pd.read_json(path_or_buf, encoding=encoding, *args, **kwargs) diff --git a/pyproject.toml b/pyproject.toml index 9669fdaf..f4d5c440 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "peakina" -version = "0.7.6" +version = "0.7.7" description = "pandas readers on steroids (remote files, glob patterns, cache, etc.)" authors = ["Toucan Toco "] readme = "README.md" diff --git a/tests/test_datasource.py b/tests/test_datasource.py index 3de7aae9..2a3fa362 100644 --- a/tests/test_datasource.py +++ b/tests/test_datasource.py @@ -248,7 +248,6 @@ def test_basic_json(path): df = pd.DataFrame({"@id": [1, 2], "title": ["Keep on dancin'", "Small Talk"]}) assert ds.get_df().equals(df) - jq_filter = '.records .record[] | .["@id"]|=tonumber' ds = DataSource( path("fixture.json"), reader_kwargs={"filter": jq_filter, "lines": True, "preview_nrows": 1}, @@ -256,6 +255,12 @@ def test_basic_json(path): df = pd.DataFrame({"@id": [1], "title": ["Keep on dancin'"]}) assert ds.get_df().equals(df) + ds = DataSource( + path("fixture.json"), + reader_kwargs={"preview_nrows": 1}, + ) + assert ds.get_df().shape == (1, 1) + def test_basic_parquet(path): """It should open a basic parquet file"""