Skip to content

Commit

Permalink
fix: fix read_json for edge cases between nrows, lines=True and filt…
Browse files Browse the repository at this point in the history
…er None (#99)
  • Loading branch information
Sanix-Darker authored Mar 3, 2022
1 parent b9ac540 commit 81ded25
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 14 deletions.
30 changes: 18 additions & 12 deletions peakina/readers/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,17 +29,23 @@ def read_json(
*args: Any,
**kwargs: Any,
) -> pd.DataFrame:
if filter is not None:
with open(path_or_buf, encoding=encoding) as f:
path_or_buf = transform_with_jq(f.read(), filter)

# for the preview_nrows and the preview_offset, we're going to convert in to list here
if preview_nrows is not None:
# In case we don't have the native nrows given in kwargs, we're going
# to use the provided preview_nrows
if (nrows := kwargs.get("nrows")) is None:
nrows = preview_nrows

kwargs["nrows"] = nrows
if filter is None:
filter = "."

with open(path_or_buf, encoding=encoding) as f:
path_or_buf = transform_with_jq(f.read(), filter)

# In case we don't have the native nrows given in kwargs, we're going
# to use the provided preview_nrows
if (nrows := kwargs.get("nrows", preview_nrows)) is not None:
if kwargs.get("lines") and kwargs.get("lines") is True:
# cf: https://github.com/pandas-dev/pandas/blob/main/pandas/io/json/_json.py#L671
kwargs["nrows"] = nrows
else:
data = json.loads(path_or_buf)
if isinstance(data, list):
path_or_buf = json.dumps(
data[preview_offset : nrows + preview_offset]
) # pragma: no cover

return pd.read_json(path_or_buf, encoding=encoding, *args, **kwargs)
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "peakina"
version = "0.7.6"
version = "0.7.7"
description = "pandas readers on steroids (remote files, glob patterns, cache, etc.)"
authors = ["Toucan Toco <[email protected]>"]
readme = "README.md"
Expand Down
7 changes: 6 additions & 1 deletion tests/test_datasource.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,14 +248,19 @@ def test_basic_json(path):
df = pd.DataFrame({"@id": [1, 2], "title": ["Keep on dancin'", "Small Talk"]})
assert ds.get_df().equals(df)

jq_filter = '.records .record[] | .["@id"]|=tonumber'
ds = DataSource(
path("fixture.json"),
reader_kwargs={"filter": jq_filter, "lines": True, "preview_nrows": 1},
)
df = pd.DataFrame({"@id": [1], "title": ["Keep on dancin'"]})
assert ds.get_df().equals(df)

ds = DataSource(
path("fixture.json"),
reader_kwargs={"preview_nrows": 1},
)
assert ds.get_df().shape == (1, 1)


def test_basic_parquet(path):
"""It should open a basic parquet file"""
Expand Down

0 comments on commit 81ded25

Please sign in to comment.