feat: Update for v2.11.0
dangotbanned committed Nov 16, 2024
1 parent f2823b4 commit fa5bea8
Showing 9 changed files with 47 additions and 15 deletions.
Binary file modified altair/datasets/_metadata/metadata.parquet
19 changes: 12 additions & 7 deletions altair/datasets/_readers.py
@@ -33,6 +33,8 @@
 import narwhals.stable.v1 as nw
 from narwhals.typing import IntoDataFrameT, IntoExpr, IntoFrameT
 
+from altair.datasets._typing import EXTENSION_SUFFIXES, is_ext_read
+
 if TYPE_CHECKING:
     import json  # noqa: F401
     import sys
@@ -257,6 +259,7 @@ def __init__(self, name: _Pandas, /) -> None:
             ".json": pd.read_json,
             ".tsv": partial["pd.DataFrame"](pd.read_csv, sep="\t"),
             ".arrow": pd.read_feather,
+            ".parquet": pd.read_parquet,
         }
         self._scan_fn = {".parquet": pd.read_parquet}
 
@@ -274,6 +277,7 @@ def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None:
             ".json": partial["pd.DataFrame"](pd.read_json, dtype_backend=_pa),
             ".tsv": partial["pd.DataFrame"](pd.read_csv, sep="\t", dtype_backend=_pa),
             ".arrow": partial(pd.read_feather, dtype_backend=_pa),
+            ".parquet": partial(pd.read_parquet, dtype_backend=_pa),
         }
         self._scan_fn = {".parquet": partial(pd.read_parquet, dtype_backend=_pa)}
 
@@ -288,6 +292,7 @@ def __init__(self, name: _Polars, /) -> None:
             ".json": pl.read_json,
             ".tsv": partial(pl.read_csv, separator="\t"),
             ".arrow": pl.read_ipc,
+            ".parquet": pl.read_parquet,
         }
         self._scan_fn = {".parquet": pl.scan_parquet}
 
@@ -304,6 +309,7 @@ def __init__(self, name: Literal["polars[pyarrow]"], /) -> None:
             ".json": pl.read_json,
             ".tsv": partial(pl.read_csv, separator="\t", use_pyarrow=True),
             ".arrow": partial(pl.read_ipc, use_pyarrow=True),
+            ".parquet": partial(pl.read_parquet, use_pyarrow=True),
         }
         self._scan_fn = {".parquet": pl.scan_parquet}
 
@@ -378,6 +384,7 @@ def pa_read_json(source: Any, /, **kwds) -> pa.Table:
             ".json": pa_read_json,
             ".tsv": partial(pa_read_csv, parse_options=tab_sep),
             ".arrow": pa_read_feather,
+            ".parquet": pa_read_parquet,
         }
         self._scan_fn = {".parquet": pa_read_parquet}
 
@@ -401,17 +408,19 @@ def validate_constraints(
     name: Dataset | LiteralString, suffix: Extension | None, tag: Version | None, /
 ) -> Metadata:
     constraints: Metadata = {}
-    suffixes = ".csv", ".json", ".tsv", ".arrow"
     if tag is not None:
         constraints["tag"] = tag
-    if name.endswith(suffixes):
+    if name.endswith(EXTENSION_SUFFIXES):
         fp = Path(name)
         constraints["dataset_name"] = fp.stem
         constraints["suffix"] = fp.suffix
         return constraints
     elif suffix is not None:
         if not is_ext_read(suffix):
-            msg = f"Expected 'suffix' to be one of {suffixes!r},\nbut got: {suffix!r}"
+            msg = (
+                f"Expected 'suffix' to be one of {EXTENSION_SUFFIXES!r},\n"
+                f"but got: {suffix!r}"
+            )
             raise TypeError(msg)
         else:
             constraints["suffix"] = suffix
@@ -432,10 +441,6 @@ def is_ext_scan(suffix: Any) -> TypeIs[_ExtensionScan]:
     return suffix == ".parquet"
 
 
-def is_ext_read(suffix: Any) -> TypeIs[Extension]:
-    return suffix in {".csv", ".json", ".tsv", ".arrow"}
-
-
 @overload
 def backend(name: _PolarsAny, /) -> _Reader[pl.DataFrame, pl.LazyFrame]: ...
 
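Each backend keys its readers by file suffix, so Parquet support amounts to one new ".parquet" entry per backend. A minimal sketch of the dispatch pattern, using polars and hypothetical names (read_fn, read) rather than the private _Reader machinery:

from functools import partial
from pathlib import Path

import polars as pl

# Suffix-keyed mapping, mirroring the _read_fn dicts above;
# ".parquet" is the entry this commit adds.
read_fn = {
    ".csv": pl.read_csv,
    ".json": pl.read_json,
    ".tsv": partial(pl.read_csv, separator="\t"),
    ".arrow": pl.read_ipc,
    ".parquet": pl.read_parquet,
}

def read(source: str | Path) -> pl.DataFrame:
    # Dispatch on the suffix; raises KeyError for unsupported extensions.
    return read_fn[Path(source).suffix](source)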
24 changes: 21 additions & 3 deletions altair/datasets/_typing.py
@@ -4,20 +4,32 @@
 from __future__ import annotations
 
 import sys
-from typing import Literal
+from typing import Any, Literal
 
 if sys.version_info >= (3, 14):
     from typing import TypedDict
 else:
     from typing_extensions import TypedDict
 
+if sys.version_info >= (3, 13):
+    from typing import TypeIs
+else:
+    from typing_extensions import TypeIs
+
 if sys.version_info >= (3, 10):
     from typing import TypeAlias
 else:
     from typing_extensions import TypeAlias
 
 
-__all__ = ["Dataset", "Extension", "Metadata", "Version"]
+__all__ = [
+    "EXTENSION_SUFFIXES",
+    "Dataset",
+    "Extension",
+    "Metadata",
+    "Version",
+    "is_ext_read",
+]
 
 Dataset: TypeAlias = Literal[
     "airports",
@@ -96,6 +108,7 @@
     "zipcodes",
 ]
 Version: TypeAlias = Literal[
+    "v2.11.0",
     "v2.10.0",
     "v2.9.0",
     "v2.8.1",
@@ -140,7 +153,12 @@
     "v1.7.0",
     "v1.5.0",
 ]
-Extension: TypeAlias = Literal[".csv", ".json", ".tsv", ".arrow"]
+Extension: TypeAlias = Literal[".csv", ".json", ".tsv", ".arrow", ".parquet"]
+EXTENSION_SUFFIXES = (".csv", ".json", ".tsv", ".arrow", ".parquet")
+
+
+def is_ext_read(suffix: Any) -> TypeIs[Extension]:
+    return suffix in {".csv", ".json", ".tsv", ".arrow", ".parquet"}
 
 
 class Metadata(TypedDict, total=False):
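is_ext_read is a TypeIs narrowing guard: after a passing check, a static type checker treats the plain str as the Extension literal union. A small usage sketch (the describe helper is hypothetical; the imports are the names this commit adds):

from altair.datasets._typing import EXTENSION_SUFFIXES, is_ext_read

def describe(suffix: str) -> str:
    if is_ext_read(suffix):
        # `suffix` is narrowed from str to Extension here.
        return f"readable dataset extension: {suffix}"
    return f"unsupported extension {suffix!r}; expected one of {EXTENSION_SUFFIXES}"

describe(".parquet")  # passes the guard as of this commit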
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -250,6 +250,8 @@ extend-safe-fixes=[
     "ANN204",
     # unnecessary-dict-comprehension-for-iterable
     "C420",
+    # unnecessary-literal-set
+    "C405"
 ]
 
 # https://docs.astral.sh/ruff/preview/#using-rules-that-are-in-preview
2 changes: 1 addition & 1 deletion tests/test_datasets.py
@@ -400,7 +400,7 @@ def _dataset_params(overrides: Mapping[Dataset, DatasetSpec]) -> Iterator[Parame
 @datasets_debug
 @pytest.mark.parametrize(
     ("name", "suffix", "tag"),
-    list(_dataset_params({"flights-3m": DatasetSpec(tag="v2.9.0")})),
+    list(_dataset_params({"flights-3m": DatasetSpec(tag="v2.11.0")})),
 )
 def test_all_datasets(
     polars_loader: Loader[pl.DataFrame, pl.LazyFrame],
13 changes: 10 additions & 3 deletions tools/datasets/__init__.py
@@ -193,6 +193,9 @@ def generate_typing(self, output: Path, /) -> None:
         NAME = "Dataset"
         TAG = "Version"
         EXT = "Extension"
+        EXTENSION_TYPES = ".csv", ".json", ".tsv", ".arrow", ".parquet"
+        EXTENSION_SUFFIXES = "EXTENSION_SUFFIXES"
+        EXTENSION_GUARD = "is_ext_read"
         METADATA_TD = "Metadata"
         DESCRIPTION_DEFAULT = "_description_"
         NOTE_SEP = f"\n\n{indent * 2}" f".. note::\n{indent * 3}"
@@ -276,14 +279,18 @@ def generate_typing(self, output: Path, /) -> None:
             f"{HEADER_COMMENT}",
             "from __future__ import annotations\n",
             "import sys",
-            "from typing import Literal, TYPE_CHECKING",
+            "from typing import Any, Literal, TYPE_CHECKING",
             utils.import_typing_extensions((3, 14), "TypedDict"),
+            utils.import_typing_extensions((3, 13), "TypeIs"),
             utils.import_typing_extensions((3, 10), "TypeAlias"),
             "\n",
-            f"__all__ = {[NAME, TAG, EXT, METADATA_TD]}\n\n"
+            f"__all__ = {[NAME, TAG, EXT, METADATA_TD, EXTENSION_GUARD, EXTENSION_SUFFIXES]}\n\n"
             f"{NAME}: TypeAlias = {utils.spell_literal(names)}",
             f"{TAG}: TypeAlias = {utils.spell_literal(tags)}",
-            f'{EXT}: TypeAlias = {utils.spell_literal([".csv", ".json", ".tsv", ".arrow"])}',
+            f"{EXT}: TypeAlias = {utils.spell_literal(EXTENSION_TYPES)}",
+            f"{EXTENSION_SUFFIXES} = {EXTENSION_TYPES!r}",
+            f"def {EXTENSION_GUARD}(suffix: Any) -> TypeIs[{EXT}]:\n"
+            f"{indent}return suffix in set({EXTENSION_TYPES!r})\n",
             UNIVERSAL_TYPED_DICT.format(
                 name=METADATA_TD,
                 metaclass_kwds=", total=False",
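Note the generated guard body is spelled set({EXTENSION_TYPES!r}), i.e. a set() call over a tuple literal; that is presumably why "C405" (unnecessary-literal-set) was added to ruff's extend-safe-fixes above, since its safe fix rewrites the call into the set literal seen in the committed _typing.py. A sketch of the emitted guard, reconstructed from the f-strings (exact whitespace may differ):

from typing import Any

from typing_extensions import TypeIs

from altair.datasets._typing import Extension

# As emitted by generate_typing, before linting:
def is_ext_read(suffix: Any) -> TypeIs[Extension]:
    return suffix in set((".csv", ".json", ".tsv", ".arrow", ".parquet"))
# ruff's C405 safe fix then rewrites the body to the set literal
# `{".csv", ".json", ".tsv", ".arrow", ".parquet"}` seen in the committed file.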
Binary file modified tools/datasets/_metadata/tags.parquet
Binary file modified tools/datasets/_metadata/tags_npm.parquet
2 changes: 1 addition & 1 deletion tools/datasets/github.py
@@ -64,7 +64,7 @@
 
 
 def is_ext_supported(suffix: str) -> TypeIs[Extension]:
-    return suffix in {".csv", ".json", ".tsv", ".arrow"}
+    return suffix in {".csv", ".json", ".tsv", ".arrow", ".parquet"}
 
 
 def _is_str(obj: Any) -> TypeIs[str]:
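tools/datasets/github.py applies the same suffix gate, presumably when deciding which files in the vega-datasets tree to index. A hypothetical filtering pass (the paths list is made up; the import assumes you run from the repository root, since tools is a dev script package, not installed):

from pathlib import Path

from tools.datasets.github import is_ext_supported

# Made-up tree listing; only suffixes the readers understand survive the filter.
paths = ["data/airports.csv", "data/flights-3m.parquet", "README.md"]
supported = [p for p in paths if is_ext_supported(Path(p).suffix)]
# -> ["data/airports.csv", "data/flights-3m.parquet"]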
